More STL binary reading stuff.
[libsub.git] / src / stl_binary_reader.cc
1 /*
2     Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18 */
19
20 #include <boost/lexical_cast.hpp>
21 #include <boost/algorithm/string.hpp>
22 #include <boost/locale.hpp>
23 #include "stl_binary_reader.h"
24 #include "exceptions.h"
25 #include "iso6937.h"
26 #include "compose.hpp"
27
28 using std::map;
29 using std::cout;
30 using std::string;
31 using std::istream;
32 using boost::lexical_cast;
33 using boost::algorithm::replace_all;
34 using boost::locale::conv::utf_to_utf;
35 using namespace sub;
36
37 template <class E, class F>
38 E
39 file_to_enum (F k, map<F, STLCode<E> > m, string name)
40 {
41         typename map<F, STLCode<E> >::const_iterator i = m.find (k);
42         if (i == m.end ()) {
43                 throw STLError (String::compose ("Unknown %1 %2 in binary STL file", name, k));
44         }
45
46         return i->second.value;
47 }
48
49 template <class E, class F>
50 string
51 enum_to_description (E v, map<F, STLCode<E> > const & m)
52 {
53         for (typename map<F, STLCode<E> >::const_iterator i = m.begin(); i != m.end(); ++i) {
54                 if (i->second.value == v) {
55                         return i->second.description;
56                 }
57         }
58
59         return "";
60 }
61
62 template <class E, class F>
63 void
64 code (map<F, STLCode<E> >& m, F k, E v, string d)
65 {
66         m[k] = STLCode<E> (v, d);
67 }
68
69
70 STLBinaryReader::STLBinaryReader (istream& in)
71         : _buffer (new unsigned char[1024])
72 {
73         create_tables ();
74         
75         in.read ((char *) _buffer, 1024);
76         if (in.gcount() != 1024) {
77                 throw STLError ("Could not read GSI block from binary STL file");
78         }
79                 
80         code_page_number = atoi (get_string (0, 3).c_str ());
81         
82         string const dfc = get_string (3, 8);
83         if (dfc == "STL24.01") {
84                 frame_rate = 24;
85         } else if (dfc == "STL25.01") {
86                 frame_rate = 25;
87         } else if (dfc == "STL30.01") {
88                 frame_rate = 30;
89         } else {
90                 throw STLError (String::compose ("Unknown disk format code %1 in binary STL file", dfc));
91         }
92
93         display_standard = file_to_enum (get_string (11, 1), _display_standard_map, "display standard code");
94         language_group = file_to_enum (get_string (12, 2), _language_group_map, "character code");
95         language = file_to_enum (get_string (14, 2), _language_map, "language code");
96         original_programme_title = get_string (16, 32);
97         original_episode_title = get_string (48, 32);
98         translated_programme_title = get_string (80, 32);
99         translated_episode_title = get_string (112, 32);
100         translator_name = get_string (144, 32);
101         translator_contact_details = get_string (176, 32);
102         subtitle_list_reference_code = get_string (208, 32);
103         creation_date = get_string (224, 6);
104         revision_date = get_string (230, 6);
105         revision_number = get_string (236, 2);
106
107         tti_blocks = atoi (get_string (238, 5).c_str ());
108         number_of_subtitles = atoi (get_string (243, 5).c_str ());
109         subtitle_groups = atoi (get_string (248, 3).c_str ());
110         maximum_characters = atoi (get_string (251, 2).c_str ());
111         maximum_rows = atoi (get_string (253, 2).c_str ());
112         timecode_status = file_to_enum (get_string (255, 1), _timecode_status_map, "timecode status code");
113         start_of_programme = get_string (256, 8);
114         first_in_cue = get_string (256, 8);
115         disks = atoi (get_string (272, 1).c_str ());
116         disk_sequence_number = atoi (get_string (273, 1).c_str ());
117         country_of_origin = get_string (274, 3);
118         publisher = get_string (277, 32);
119         editor_name = get_string (309, 32);
120         editor_contact_details = get_string (341, 32);
121
122         for (int i = 0; i < tti_blocks; ++i) {
123                 Subtitle sub;
124                 
125                 in.read ((char *) _buffer, 128);
126                 if (in.gcount() != 128) {
127                         throw STLError ("Could not read TTI block from binary STL file");
128                 }
129
130                 if (file_to_enum (get_int (15, 1), _comment_map, "comment flag") == COMMENT_YES) {
131                         continue;
132                 }
133
134                 sub.from.frame = get_timecode (5);
135                 sub.to.frame = get_timecode (9);
136                 sub.line = get_int (13, 1);
137
138                 /* XXX: justification, effects */
139
140                 string s = get_string (16, 112);
141                 
142                 /* 8Ah is a new line */
143                 replace_all (s, "\x8a", "\n");
144
145                 /* 8Fh is unused space, so trim the string to the first instance of that */
146                 size_t unused = s.find_first_of ('\x8f');
147                 if (unused != string::npos) {
148                         s = s.substr (0, unused);
149                 }
150                 
151                 sub.text = utf_to_utf<char> (iso6937_to_utf16 (s.c_str()));
152
153                 _subs.push_back (sub);
154         }
155 }
156
157 STLBinaryReader::~STLBinaryReader ()
158 {
159         delete[] _buffer;
160 }
161
162 string
163 STLBinaryReader::get_string (int offset, int length) const
164 {
165         string s;
166         for (int i = 0; i < length; ++i) {
167                 s += _buffer[offset + i];
168         }
169
170         return s;
171 }
172
173 int
174 STLBinaryReader::get_int (int offset, int length) const
175 {
176         int v = 0;
177         for (int i = 0; i < length; ++i) {
178                 v |= _buffer[offset + i] << (8 * i);
179         }
180
181         return v;
182 }
183
184 FrameTime
185 STLBinaryReader::get_timecode (int offset) const
186 {
187         return FrameTime (_buffer[offset], _buffer[offset + 1], _buffer[offset + 2], _buffer[offset + 3]);
188 }
189
190 map<string, string>
191 STLBinaryReader::metadata () const
192 {
193         map<string, string> m;
194
195         m["Code page number"] = lexical_cast<string> (code_page_number);
196         m["Frame rate"] = lexical_cast<string> (frame_rate);
197         m["Display standard"] = enum_to_description (display_standard, _display_standard_map);
198         m["Language group"] = enum_to_description (language_group, _language_group_map);
199         m["Language"] = enum_to_description (language, _language_map);
200         m["Original programme title"] = original_programme_title;
201         m["Original episode title"] = original_episode_title;
202         m["Translated programme title"] = translated_programme_title;
203         m["Translated episode title"] = translated_episode_title;
204         m["Translator name"] = translator_name;
205         m["Translator contact details"] = translator_contact_details;
206         m["Subtitle list reference code"] = subtitle_list_reference_code;
207         m["Creation date"] = creation_date;
208         m["Revision date"] = revision_date;
209         m["Revision number"] = revision_number;
210         m["TTI blocks"] = lexical_cast<string> (tti_blocks);
211         m["Number of subtitles"] = lexical_cast<string> (number_of_subtitles);
212         m["Subtitle groups"] = lexical_cast<string> (subtitle_groups);
213         m["Maximum characters"] = lexical_cast<string> (maximum_characters);
214         m["Maximum rows"] = lexical_cast<string> (maximum_rows);
215         m["Timecode status"] = enum_to_description (timecode_status, _timecode_status_map);
216         m["Start of programme"] = start_of_programme;
217         m["First in cue"] = first_in_cue;
218         m["Disks"] = lexical_cast<string> (disks);
219         m["Disk sequence number"] = lexical_cast<string> (disk_sequence_number);
220         m["Country of origin"] = country_of_origin;
221         m["Publisher"] = publisher;
222         m["Editor name"] = editor_name;
223         m["Editor contact details"] = editor_contact_details;
224
225         return m;
226 }
227
228 void
229 STLBinaryReader::create_tables ()
230 {
231         code<DisplayStandard, string> (_display_standard_map, " ", DISPLAY_STANDARD_UNDEFINED, "Undefined");
232         code<DisplayStandard, string> (_display_standard_map, "0", DISPLAY_STANDARD_OPEN_SUBTITLING, "Open subtitling");
233         code<DisplayStandard, string> (_display_standard_map, "1", DISPLAY_STANDARD_LEVEL_1_TELETEXT, "Level 1 teletext");
234         code<DisplayStandard, string> (_display_standard_map, "2", DISPLAY_STANDARD_LEVEL_2_TELETEXT, "Level 2 teletext");
235         
236         code<LanguageGroup, string> (_language_group_map, "00", LANGUAGE_GROUP_LATIN, "Latin");
237         code<LanguageGroup, string> (_language_group_map, "01", LANGUAGE_GROUP_LATIN_CYRILLIC, "Latin/Cyrillic");
238         code<LanguageGroup, string> (_language_group_map, "02", LANGUAGE_GROUP_LATIN_ARABIC, "Latin/Arabic");
239         code<LanguageGroup, string> (_language_group_map, "03", LANGUAGE_GROUP_LATIN_GREEK, "Latin/Greek");
240         code<LanguageGroup, string> (_language_group_map, "04", LANGUAGE_GROUP_LATIN_HEBREW, "Latin/Hebrew");
241         
242         code<Language, string> (_language_map, "00", LANGUAGE_UNKNOWN, "Unknown");
243         code<Language, string> (_language_map, "01", LANGUAGE_ALBANIAN, "Albanian");
244         code<Language, string> (_language_map, "02", LANGUAGE_BRETON, "Breton");
245         code<Language, string> (_language_map, "03", LANGUAGE_CATALAN, "Catalan");
246         code<Language, string> (_language_map, "04", LANGUAGE_CROATIAN, "Croatian");
247         code<Language, string> (_language_map, "05", LANGUAGE_WELSH, "Welsh");
248         code<Language, string> (_language_map, "06", LANGUAGE_CZECH, "Czech");
249         code<Language, string> (_language_map, "07", LANGUAGE_DANISH, "Danish");
250         code<Language, string> (_language_map, "08", LANGUAGE_GERMAN, "German");
251         code<Language, string> (_language_map, "09", LANGUAGE_ENGLISH, "English");
252         code<Language, string> (_language_map, "0A", LANGUAGE_SPANISH, "Spanish");
253         code<Language, string> (_language_map, "0B", LANGUAGE_ESPERANTO, "Esperanto");
254         code<Language, string> (_language_map, "0C", LANGUAGE_ESTONIAN, "Estonian");
255         code<Language, string> (_language_map, "0D", LANGUAGE_BASQUE, "Basque");
256         code<Language, string> (_language_map, "0E", LANGUAGE_FAROESE, "Faroese");
257         code<Language, string> (_language_map, "0F", LANGUAGE_FRENCH, "French");
258         code<Language, string> (_language_map, "10", LANGUAGE_FRISIAN, "Frisian");
259         code<Language, string> (_language_map, "11", LANGUAGE_IRISH, "Irish");
260         code<Language, string> (_language_map, "12", LANGUAGE_GAELIC, "Gaelic");
261         code<Language, string> (_language_map, "13", LANGUAGE_GALACIAN, "Galacian");
262         code<Language, string> (_language_map, "14", LANGUAGE_ICELANDIC, "Icelandic");
263         code<Language, string> (_language_map, "15", LANGUAGE_ITALIAN, "Italian");
264         code<Language, string> (_language_map, "16", LANGUAGE_LAPPISH, "Lappish");
265         code<Language, string> (_language_map, "17", LANGUAGE_LATIN, "Latin");
266         code<Language, string> (_language_map, "18", LANGUAGE_LATVIAN, "Latvian");
267         code<Language, string> (_language_map, "19", LANGUAGE_LUXEMBORGIAN, "Luxemborgian");
268         code<Language, string> (_language_map, "1A", LANGUAGE_LITHUANIAN, "Lithuanian");
269         code<Language, string> (_language_map, "1B", LANGUAGE_HUNGARIAN, "Hungarian");
270         code<Language, string> (_language_map, "1C", LANGUAGE_MALTESE, "Maltese");
271         code<Language, string> (_language_map, "1D", LANGUAGE_DUTCH, "Dutch");
272         code<Language, string> (_language_map, "1E", LANGUAGE_NORWEGIAN, "Norwegian");
273         code<Language, string> (_language_map, "1F", LANGUAGE_OCCITAN, "Occitan");
274         code<Language, string> (_language_map, "20", LANGUAGE_POLISH, "Polish");
275         code<Language, string> (_language_map, "21", LANGUAGE_PORTUGESE, "Portugese");
276         code<Language, string> (_language_map, "22", LANGUAGE_ROMANIAN, "Romanian");
277         code<Language, string> (_language_map, "23", LANGUAGE_ROMANSH, "Romansh");
278         code<Language, string> (_language_map, "24", LANGUAGE_SERBIAN, "Serbian");
279         code<Language, string> (_language_map, "25", LANGUAGE_SLOVAK, "Slovak");
280         code<Language, string> (_language_map, "26", LANGUAGE_SLOVENIAN, "Slovenian");
281         code<Language, string> (_language_map, "27", LANGUAGE_FINNISH, "Finnish");
282         code<Language, string> (_language_map, "28", LANGUAGE_SWEDISH, "Swedish");
283         code<Language, string> (_language_map, "29", LANGUAGE_TURKISH, "Turkish");
284         code<Language, string> (_language_map, "2A", LANGUAGE_FLEMISH, "Flemish");
285         code<Language, string> (_language_map, "2B", LANGUAGE_WALLON, "Wallon");
286         code<Language, string> (_language_map, "7F", LANGUAGE_AMHARIC, "Amharic");
287         code<Language, string> (_language_map, "7E", LANGUAGE_ARABIC, "Arabic");
288         code<Language, string> (_language_map, "7D", LANGUAGE_ARMENIAN, "Armenian");
289         code<Language, string> (_language_map, "7C", LANGUAGE_ASSAMESE, "Assamese");
290         code<Language, string> (_language_map, "7B", LANGUAGE_AZERBAIJANI, "Azerbaijani");
291         code<Language, string> (_language_map, "7A", LANGUAGE_BAMBORA, "Bambora");
292         code<Language, string> (_language_map, "79", LANGUAGE_BIELORUSSIAN, "Bielorussian");
293         code<Language, string> (_language_map, "78", LANGUAGE_BENGALI, "Bengali");
294         code<Language, string> (_language_map, "77", LANGUAGE_BULGARIAN, "Bulgarian");
295         code<Language, string> (_language_map, "76", LANGUAGE_BURMESE, "Burmese");
296         code<Language, string> (_language_map, "75", LANGUAGE_CHINESE, "Chinese");
297         code<Language, string> (_language_map, "74", LANGUAGE_CHURASH, "Churash");
298         code<Language, string> (_language_map, "73", LANGUAGE_DARI, "Dari");
299         code<Language, string> (_language_map, "72", LANGUAGE_FULANI, "Fulani");
300         code<Language, string> (_language_map, "71", LANGUAGE_GEORGIAN, "Georgian");
301         code<Language, string> (_language_map, "70", LANGUAGE_GREEK, "Greek");
302         code<Language, string> (_language_map, "6F", LANGUAGE_GUJURATI, "Gujarati");
303         code<Language, string> (_language_map, "6E", LANGUAGE_GURANI, "Gurani");
304         code<Language, string> (_language_map, "6D", LANGUAGE_HAUSA, "Hausa");
305         code<Language, string> (_language_map, "6C", LANGUAGE_HEBREW, "Hebrew");
306         code<Language, string> (_language_map, "6B", LANGUAGE_HINDI, "Hindi");
307         code<Language, string> (_language_map, "6A", LANGUAGE_INDONESIAN, "Indonesian");
308         code<Language, string> (_language_map, "69", LANGUAGE_JAPANESE, "Japanese");
309         code<Language, string> (_language_map, "68", LANGUAGE_KANNADA, "Kannada");
310         code<Language, string> (_language_map, "67", LANGUAGE_KAZAKH, "Kazakh");
311         code<Language, string> (_language_map, "66", LANGUAGE_KHMER, "Khmer");
312         code<Language, string> (_language_map, "65", LANGUAGE_KOREAN, "Korean");
313         code<Language, string> (_language_map, "64", LANGUAGE_LAOTIAN, "Laotian");
314         code<Language, string> (_language_map, "63", LANGUAGE_MACEDONIAN, "Macedonian");
315         code<Language, string> (_language_map, "62", LANGUAGE_MALAGASAY, "Malagasay");
316         code<Language, string> (_language_map, "61", LANGUAGE_MALAYSIAN, "Malaysian");
317         code<Language, string> (_language_map, "60", LANGUAGE_MOLDAVIAN, "Moldavian");
318         code<Language, string> (_language_map, "5F", LANGUAGE_MARATHI, "Marathi");
319         code<Language, string> (_language_map, "5E", LANGUAGE_NDEBELE, "Ndebele");
320         code<Language, string> (_language_map, "5D", LANGUAGE_NEPALI, "Nepali");
321         code<Language, string> (_language_map, "5C", LANGUAGE_ORIYA, "Oriya");
322         code<Language, string> (_language_map, "5B", LANGUAGE_PAPAMIENTO, "Papamiento");
323         code<Language, string> (_language_map, "5A", LANGUAGE_PERSIAN, "Persian");
324         code<Language, string> (_language_map, "59", LANGUAGE_PUNJABI, "Punjabi");
325         code<Language, string> (_language_map, "58", LANGUAGE_PUSHTU, "Pushtu");
326         code<Language, string> (_language_map, "57", LANGUAGE_QUECHUA, "Quechua");
327         code<Language, string> (_language_map, "56", LANGUAGE_RUSSIAN, "Russian");
328         code<Language, string> (_language_map, "55", LANGUAGE_RUTHENIAN, "Ruthenian");
329         code<Language, string> (_language_map, "54", LANGUAGE_SERBO_CROAT, "Serbo Croat");
330         code<Language, string> (_language_map, "53", LANGUAGE_SHONA, "Shona");
331         code<Language, string> (_language_map, "52", LANGUAGE_SINHALESE, "Sinhalese");
332         code<Language, string> (_language_map, "51", LANGUAGE_SOMALI, "Somali");
333         code<Language, string> (_language_map, "50", LANGUAGE_SRANAN_TONGO, "Sranan Tongo");
334         code<Language, string> (_language_map, "4F", LANGUAGE_SWAHILI, "Swahili");
335         code<Language, string> (_language_map, "4E", LANGUAGE_TADZHIK, "Tadzhik");
336         code<Language, string> (_language_map, "4D", LANGUAGE_TAMIL, "Tamil");
337         code<Language, string> (_language_map, "4C", LANGUAGE_TATAR, "Tatar");
338         code<Language, string> (_language_map, "4B", LANGUAGE_TELUGU, "Telugu");
339         code<Language, string> (_language_map, "4A", LANGUAGE_THAI, "Thai");
340         code<Language, string> (_language_map, "49", LANGUAGE_UKRANIAN, "Ukranian");
341         code<Language, string> (_language_map, "48", LANGUAGE_URDU, "Urdu");
342         code<Language, string> (_language_map, "47", LANGUAGE_UZBEK, "Uzbek");
343         code<Language, string> (_language_map, "46", LANGUAGE_VIETNAMESE, "Vietnamese");
344         code<Language, string> (_language_map, "45", LANGUAGE_ZULU, "Zulu");
345
346         code<TimecodeStatus, string> (_timecode_status_map, "0", TIMECODE_STATUS_NOT_INTENDED_FOR_USE, "Not intended for use");
347         code<TimecodeStatus, string> (_timecode_status_map, "1", TIMECODE_STATUS_INTENDED_FOR_USE, "Intended for use");
348
349         code<CumulativeStatus, int> (_cumulative_status_map, 0, CUMULATIVE_STATUS_NOT_CUMULATIVE, "Not part of a cumulative set");
350         code<CumulativeStatus, int> (_cumulative_status_map, 1, CUMULATIVE_STATUS_FIRST, "First subtitle of a cumulative set");
351         code<CumulativeStatus, int> (_cumulative_status_map, 2, CUMULATIVE_STATUS_INTERMEDIATE, "Intermediate subtitle of a cumulative set");
352         code<CumulativeStatus, int> (_cumulative_status_map, 3, CUMULATIVE_STATUS_LAST, "Last subtitle of a cumulative set");
353
354         code<Justification, int> (_justification_map, 0, JUSTIFICATION_NONE, "No justification");
355         code<Justification, int> (_justification_map, 1, JUSTIFICATION_LEFT, "Left justification");
356         code<Justification, int> (_justification_map, 2, JUSTIFICATION_CENTRE, "Centre justification");
357         code<Justification, int> (_justification_map, 3, JUSTIFICATION_CENTRE, "Right justification");
358
359         code<Comment, int> (_comment_map, 0, COMMENT_NO, "Not a comment");
360         code<Comment, int> (_comment_map, 1, COMMENT_YES, "Is a comment");
361 }