Bump libcxml for OS X fix.
[libsub.git] / src / ssa_reader.cc
1 /*
2     Copyright (C) 2016-2019 Carl Hetherington <cth@carlh.net>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18 */
19
20 #include "ssa_reader.h"
21 #include "util.h"
22 #include "sub_assert.h"
23 #include "raw_convert.h"
24 #include "subtitle.h"
25 #include <boost/algorithm/string.hpp>
26 #include <boost/bind.hpp>
27 #include <boost/foreach.hpp>
28 #include <iostream>
29 #include <vector>
30
31 using std::string;
32 using std::vector;
33 using std::map;
34 using std::cout;
35 using std::list;
36 using boost::optional;
37 using boost::function;
38 using namespace boost::algorithm;
39 using namespace sub;
40
41 /** @param s Subtitle string encoded in UTF-8 */
42 SSAReader::SSAReader (string s)
43 {
44         this->read (boost::bind(&get_line_string, &s));
45 }
46
47 /** @param f Subtitle file encoded in UTF-8 */
48 SSAReader::SSAReader (FILE* f)
49 {
50         this->read (boost::bind (&get_line_file, f));
51 }
52
53 class Style
54 {
55 public:
56         Style ()
57                 : font_size (72)
58                 , primary_colour (255, 255, 255)
59                 , bold (false)
60                 , italic (false)
61                 , underline (false)
62                 , horizontal_reference (HORIZONTAL_CENTRE_OF_SCREEN)
63                 , vertical_reference (BOTTOM_OF_SCREEN)
64                 , vertical_margin (0)
65         {}
66
67         Style (string format_line, string style_line)
68                 : font_size (72)
69                 , primary_colour (255, 255, 255)
70                 , bold (false)
71                 , italic (false)
72                 , underline (false)
73                 , horizontal_reference (HORIZONTAL_CENTRE_OF_SCREEN)
74                 , vertical_reference (BOTTOM_OF_SCREEN)
75                 , vertical_margin (0)
76         {
77                 vector<string> keys;
78                 split (keys, format_line, boost::is_any_of (","));
79                 vector<string> style;
80                 split (style, style_line, boost::is_any_of (","));
81
82                 SUB_ASSERT (!keys.empty());
83                 SUB_ASSERT (!style.empty());
84                 SUB_ASSERT (keys.size() == style.size());
85
86                 for (size_t i = 0; i < style.size(); ++i) {
87                         trim (keys[i]);
88                         trim (style[i]);
89                         if (keys[i] == "Name") {
90                                 name = style[i];
91                         } else if (keys[i] == "Fontname") {
92                                 font_name = style[i];
93                         } else if (keys[i] == "Fontsize") {
94                                 font_size = raw_convert<int> (style[i]);
95                         } else if (keys[i] == "PrimaryColour") {
96                                 primary_colour = colour (raw_convert<int> (style[i]));
97                         } else if (keys[i] == "BackColour") {
98                                 back_colour = colour (raw_convert<int> (style[i]));
99                         } else if (keys[i] == "Bold") {
100                                 bold = style[i] == "-1";
101                         } else if (keys[i] == "Italic") {
102                                 italic = style[i] == "-1";
103                         } else if (keys[i] == "Underline") {
104                                 underline = style[i] == "-1";
105                         } else if (keys[i] == "BorderStyle") {
106                                 if (style[i] == "1") {
107                                         effect = SHADOW;
108                                 }
109                         } else if (keys[i] == "Alignment") {
110                                 /* These values from libass' source code */
111                                 switch ((raw_convert<int> (style[i]) - 1) % 3) {
112                                 case 0:
113                                         horizontal_reference = LEFT_OF_SCREEN;
114                                         break;
115                                 case 1:
116                                         horizontal_reference = HORIZONTAL_CENTRE_OF_SCREEN;
117                                         break;
118                                 case 2:
119                                         horizontal_reference = RIGHT_OF_SCREEN;
120                                         break;
121                                 }
122                                 switch (raw_convert<int> (style[i]) & 12) {
123                                 case 4:
124                                         vertical_reference = TOP_OF_SCREEN;
125                                         break;
126                                 case 8:
127                                         vertical_reference = VERTICAL_CENTRE_OF_SCREEN;
128                                         break;
129                                 case 0:
130                                         vertical_reference = BOTTOM_OF_SCREEN;
131                                         break;
132                                 }
133                         } else if (keys[i] == "MarginV") {
134                                 vertical_margin = raw_convert<int> (style[i]);
135                         }
136                 }
137         }
138
139         string name;
140         optional<string> font_name;
141         int font_size;
142         Colour primary_colour;
143         /** outline colour */
144         optional<Colour> back_colour;
145         bool bold;
146         bool italic;
147         bool underline;
148         optional<Effect> effect;
149         HorizontalReference horizontal_reference;
150         VerticalReference vertical_reference;
151         int vertical_margin;
152
153 private:
154         Colour colour (int c) const
155         {
156                 return Colour (
157                         ((c & 0x0000ff) >>  0) / 255.0,
158                         ((c & 0x00ff00) >>  8) / 255.0,
159                         ((c & 0xff0000) >> 16) / 255.0
160                         );
161         }
162 };
163
164 Time
165 SSAReader::parse_time (string t) const
166 {
167         vector<string> bits;
168         split (bits, t, is_any_of (":."));
169         SUB_ASSERT (bits.size() == 4);
170         return Time::from_hms (
171                 raw_convert<int> (bits[0]),
172                 raw_convert<int> (bits[1]),
173                 raw_convert<int> (bits[2]),
174                 raw_convert<int> (bits[3]) * 10
175                 );
176 }
177
178 /** @param base RawSubtitle filled in with any required common values.
179  *  @param line SSA line string (i.e. just the subtitle, possibly with embedded stuff)
180  *  @return List of RawSubtitles to represent line with vertical reference TOP_OF_SUBTITLE.
181  */
182 list<RawSubtitle>
183 SSAReader::parse_line (RawSubtitle base, string line, int play_res_x, int play_res_y)
184 {
185         enum {
186                 TEXT,
187                 STYLE,
188                 BACKSLASH
189         } state = TEXT;
190
191         list<RawSubtitle> subs;
192         RawSubtitle current = base;
193         string style;
194
195         if (!current.vertical_position.reference) {
196                 current.vertical_position.reference = BOTTOM_OF_SCREEN;
197         }
198
199         if (!current.vertical_position.proportional) {
200                 current.vertical_position.proportional = 0;
201         }
202
203         /* We must have a font size, as there could be a margin specified
204            in pixels and in that case we must know how big the subtitle
205            lines are to work out the position on screen.
206         */
207         if (!current.font_size.points()) {
208                 current.font_size.set_points (72);
209         }
210
211         /* Count the number of line breaks */
212         int line_breaks = 0;
213         if (line.length() > 0) {
214                 for (size_t i = 0; i < line.length() - 1; ++i) {
215                         if (line[i] == '\\' && (line[i+1] == 'n' || line[i+1] == 'N')) {
216                                 ++line_breaks;
217                         }
218                 }
219         }
220
221         /* Imagine that the screen is 792 points (i.e. 11 inches) high (as with DCP) */
222         double const line_size = current.font_size.proportional(792) * 1.2;
223
224         /* Tweak vertical_position accordingly */
225         switch (current.vertical_position.reference.get()) {
226         case TOP_OF_SCREEN:
227         case TOP_OF_SUBTITLE:
228                 /* Nothing to do */
229                 break;
230         case VERTICAL_CENTRE_OF_SCREEN:
231                 current.vertical_position.proportional = current.vertical_position.proportional.get() - ((line_breaks + 1) * line_size) / 2;
232                 break;
233         case BOTTOM_OF_SCREEN:
234                 current.vertical_position.proportional = current.vertical_position.proportional.get() + line_breaks * line_size;
235                 break;
236         }
237
238         for (size_t i = 0; i < line.length(); ++i) {
239                 char const c = line[i];
240                 switch (state) {
241                 case TEXT:
242                         if (c == '{') {
243                                 state = STYLE;
244                         } else if (c == '\\') {
245                                 state = BACKSLASH;
246                         } else if (c != '\r' && c != '\n') {
247                                 current.text += c;
248                         }
249                         break;
250                 case STYLE:
251                         if (c == '}' || c == '\\') {
252                                 if (!current.text.empty ()) {
253                                         subs.push_back (current);
254                                         current.text = "";
255                                 }
256                                 if (style == "\\i1") {
257                                         current.italic = true;
258                                 } else if (style == "\\i0" || style == "\\i") {
259                                         current.italic = false;
260                                 } else if (style == "\\b1") {
261                                         current.bold = true;
262                                 } else if (style == "\\b0") {
263                                         current.bold = false;
264                                 } else if (style == "\\u1") {
265                                         current.underline = true;
266                                 } else if (style == "\\u0") {
267                                         current.underline = false;
268                                 } else if (style == "\\an1" || style == "\\an2" || style == "\\an3") {
269                                         current.vertical_position.reference = sub::BOTTOM_OF_SCREEN;
270                                 } else if (style == "\\an4" || style == "\\an5" || style == "\\an6") {
271                                         current.vertical_position.reference = sub::VERTICAL_CENTRE_OF_SCREEN;
272                                 } else if (style == "\\an7" || style == "\\an8" || style == "\\an9") {
273                                         current.vertical_position.reference = sub::TOP_OF_SCREEN;
274                                 } else if (boost::starts_with(style, "\\pos")) {
275                                         vector<string> bits;
276                                         boost::algorithm::split (bits, style, boost::is_any_of("(,"));
277                                         SUB_ASSERT (bits.size() == 3);
278                                         current.horizontal_position.reference = sub::LEFT_OF_SCREEN;
279                                         current.horizontal_position.proportional = raw_convert<float>(bits[1]) / play_res_x;
280                                         current.vertical_position.reference = sub::TOP_OF_SCREEN;
281                                         current.vertical_position.proportional = raw_convert<float>(bits[2]) / play_res_y;
282                                 } else if (boost::starts_with(style, "\\fs")) {
283                                         SUB_ASSERT (style.length() > 3);
284                                         current.font_size.set_points (raw_convert<int>(style.substr(3)));
285                                 }
286                                 style = "";
287                         }
288
289                         if (c == '}') {
290                                 state = TEXT;
291                         } else {
292                                 style += c;
293                         }
294                         break;
295                 case BACKSLASH:
296                         if (c == 'n' || c == 'N') {
297                                 if (!current.text.empty ()) {
298                                         subs.push_back (current);
299                                         current.text = "";
300                                 }
301                                 /* Move down one line (1.2 times the font size) */
302                                 if (current.vertical_position.reference.get() == BOTTOM_OF_SCREEN) {
303                                         current.vertical_position.proportional = current.vertical_position.proportional.get() - line_size;
304                                 } else {
305                                         current.vertical_position.proportional = current.vertical_position.proportional.get() + line_size;
306                                 }
307                         }
308                         state = TEXT;
309                         break;
310                 }
311         }
312
313         if (!current.text.empty ()) {
314                 subs.push_back (current);
315         }
316
317         return subs;
318 }
319
320 void
321 SSAReader::read (function<optional<string> ()> get_line)
322 {
323         enum {
324                 INFO,
325                 STYLES,
326                 EVENTS
327         } part = INFO;
328
329         int play_res_x = 288;
330         int play_res_y = 288;
331         map<string, Style> styles;
332         string style_format_line;
333         vector<string> event_format;
334
335         while (true) {
336                 optional<string> line = get_line ();
337                 if (!line) {
338                         break;
339                 }
340
341                 trim (*line);
342                 remove_unicode_bom (line);
343
344                 if (starts_with (*line, ";") || line->empty ()) {
345                         continue;
346                 }
347
348                 if (starts_with (*line, "[")) {
349                         /* Section heading */
350                         if (line.get() == "[Script Info]") {
351                                 part = INFO;
352                         } else if (line.get() == "[V4 Styles]" || line.get() == "[V4+ Styles]") {
353                                 part = STYLES;
354                         } else if (line.get() == "[Events]") {
355                                 part = EVENTS;
356                         }
357                         continue;
358                 }
359
360                 size_t const colon = line->find (":");
361                 SUB_ASSERT (colon != string::npos);
362                 string const type = line->substr (0, colon);
363                 string body = line->substr (colon + 1);
364                 trim (body);
365
366                 switch (part) {
367                 case INFO:
368                         if (type == "PlayResX") {
369                                 play_res_x = raw_convert<int> (body);
370                         } else if (type == "PlayResY") {
371                                 play_res_y = raw_convert<int> (body);
372                         }
373                         break;
374                 case STYLES:
375                         if (type == "Format") {
376                                 style_format_line = body;
377                         } else if (type == "Style") {
378                                 SUB_ASSERT (!style_format_line.empty ());
379                                 Style s (style_format_line, body);
380                                 styles[s.name] = s;
381                         }
382                         break;
383                 case EVENTS:
384                         if (type == "Format") {
385                                 split (event_format, body, is_any_of (","));
386                                 BOOST_FOREACH (string& i, event_format) {
387                                         trim (i);
388                                 }
389                         } else if (type == "Dialogue") {
390                                 SUB_ASSERT (!event_format.empty ());
391                                 vector<string> event;
392                                 split (event, body, is_any_of (","));
393
394                                 /* There may be commas in the subtitle part; reassemble any extra parts
395                                    from when we just split it.
396                                 */
397                                 while (event.size() > event_format.size()) {
398                                         string const ex = event.back ();
399                                         event.pop_back ();
400                                         event.back() += "," + ex;
401                                 }
402
403                                 SUB_ASSERT (!event.empty());
404                                 SUB_ASSERT (event_format.size() == event.size());
405
406                                 RawSubtitle sub;
407
408                                 for (size_t i = 0; i < event.size(); ++i) {
409                                         trim (event[i]);
410                                         if (event_format[i] == "Start") {
411                                                 sub.from = parse_time (event[i]);
412                                         } else if (event_format[i] == "End") {
413                                                 sub.to = parse_time (event[i]);
414                                         } else if (event_format[i] == "Style") {
415                                                 /* libass trims leading '*'s from style names, commenting that
416                                                    "they seem to mean literally nothing".  Go figure...
417                                                 */
418                                                 trim_left_if (event[i], boost::is_any_of ("*"));
419                                                 SUB_ASSERT (styles.find(event[i]) != styles.end());
420                                                 Style style = styles[event[i]];
421                                                 sub.font = style.font_name;
422                                                 sub.font_size = FontSize::from_points (style.font_size);
423                                                 sub.colour = style.primary_colour;
424                                                 sub.effect_colour = style.back_colour;
425                                                 sub.bold = style.bold;
426                                                 sub.italic = style.italic;
427                                                 sub.underline = style.underline;
428                                                 sub.effect = style.effect;
429                                                 sub.horizontal_position.reference = style.horizontal_reference;
430                                                 sub.vertical_position.reference = style.vertical_reference;
431                                                 sub.vertical_position.proportional = float(style.vertical_margin) / play_res_y;
432                                         } else if (event_format[i] == "MarginV") {
433                                                 sub.vertical_position.proportional = raw_convert<float>(event[i]) / play_res_y;
434                                         } else if (event_format[i] == "Text") {
435                                                 BOOST_FOREACH (sub::RawSubtitle j, parse_line (sub, event[i], play_res_x, play_res_y)) {
436                                                         _subs.push_back (j);
437                                                 }
438                                         }
439                                 }
440                         }
441                 }
442
443         }
444 }