89a5599d54328adcccbedcf608a1f61e32f59807
[libsub.git] / src / subrip_reader.cc
1 /*
2     Copyright (C) 2014-2020 Carl Hetherington <cth@carlh.net>
3
4     This program is free software; you can redistribute it and/or modify
5     it under the terms of the GNU General Public License as published by
6     the Free Software Foundation; either version 2 of the License, or
7     (at your option) any later version.
8
9     This program is distributed in the hope that it will be useful,
10     but WITHOUT ANY WARRANTY; without even the implied warranty of
11     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12     GNU General Public License for more details.
13
14     You should have received a copy of the GNU General Public License
15     along with this program; if not, write to the Free Software
16     Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17
18 */
19
20 /** @file  src/subrip_reader.cc
21  *  @brief SubripReader class.
22  */
23
24 #include "subrip_reader.h"
25 #include "exceptions.h"
26 #include "util.h"
27 #include "sub_assert.h"
28 #include "raw_convert.h"
29 #include "ssa_reader.h"
30 #include <boost/algorithm/string.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/regex.hpp>
33 #include <boost/bind.hpp>
34 #include <cstdio>
35 #include <vector>
36 #include <iostream>
37
38 using std::string;
39 using std::vector;
40 using std::list;
41 using std::cout;
42 using std::hex;
43 using boost::lexical_cast;
44 using boost::to_upper;
45 using boost::optional;
46 using boost::function;
47 using boost::algorithm::replace_all;
48 using namespace sub;
49
50 /** @param s Subtitle string encoded in UTF-8 */
51 SubripReader::SubripReader (string s)
52 {
53         this->read (boost::bind(&get_line_string, &s));
54 }
55
56 /** @param f Subtitle file encoded in UTF-8 */
57 SubripReader::SubripReader (FILE* f)
58 {
59         this->read (boost::bind (&get_line_file, f));
60 }
61
62 void
63 SubripReader::read (function<optional<string> ()> get_line)
64 {
65         enum {
66                 COUNTER,
67                 METADATA,
68                 CONTENT
69         } state = COUNTER;
70
71         RawSubtitle rs;
72
73         rs.vertical_position.line = 0;
74         rs.vertical_position.reference = TOP_OF_SUBTITLE;
75
76         while (true) {
77                 optional<string> line = get_line ();
78                 if (!line) {
79                         break;
80                 }
81
82                 trim_right_if (*line, boost::is_any_of ("\n\r"));
83                 remove_unicode_bom (line);
84
85                 /* Keep some history in case there is an error to report */
86                 _context.push_back (*line);
87                 if (_context.size() > 5) {
88                         _context.pop_front ();
89                 }
90
91                 switch (state) {
92                 case COUNTER:
93                 {
94                         if (line->empty ()) {
95                                 /* a blank line at the start is ok */
96                                 break;
97                         }
98
99                         state = METADATA;
100
101                         /* Reset stuff that should not persist across separate subtitles */
102                         rs.bold = false;
103                         rs.italic = false;
104                         rs.underline = false;
105                         rs.vertical_position.line = 0;
106                         rs.vertical_position.reference = TOP_OF_SUBTITLE;
107                 }
108                 break;
109                 case METADATA:
110                 {
111                         vector<string> p;
112
113                         /* Further trim this line, removing spaces from the end */
114                         trim_right_if (*line, boost::is_any_of (" "));
115
116                         boost::algorithm::split (p, *line, boost::algorithm::is_any_of (" "), boost::token_compress_on);
117                         if (p.size() != 3 && p.size() != 7) {
118                                 for (int i = 0; i < 2; ++i) {
119                                         optional<string> ex = get_line ();
120                                         if (ex) {
121                                                 _context.push_back (*ex);
122                                         }
123                                 }
124                                 throw SubripError (*line, "a time/position line", _context);
125                         }
126
127                         rs.from = convert_time (p[0]);
128                         rs.to = convert_time (p[2]);
129
130                         /* XXX: should not ignore coordinate specifications */
131
132                         state = CONTENT;
133                         break;
134                 }
135                 case CONTENT:
136                         if (line->empty ()) {
137                                 state = COUNTER;
138                         } else {
139                                 convert_line (*line, rs);
140                                 rs.vertical_position.line = rs.vertical_position.line.get() + 1;
141                         }
142                         break;
143                 }
144         }
145 }
146
147 Time
148 SubripReader::convert_time (string t)
149 {
150         vector<string> a;
151         boost::algorithm::split (a, t, boost::is_any_of (":"));
152         if (a.size() != 3) {
153                 throw SubripError (t, "time in the format h:m:s,ms", _context);
154         }
155
156         vector<string> b;
157         boost::algorithm::split (b, a[2], boost::is_any_of (","));
158         if (b.size() != 2) {
159                 throw SubripError (t, "time in the format h:m:s,ms", _context);
160         }
161
162         int h, m, s, ms;
163
164         try {
165                 h = lexical_cast<int>(a[0]);
166         } catch (boost::bad_lexical_cast &) {
167                 throw SubripError (t, "integer hour value", _context);
168         }
169
170         try {
171                 m = lexical_cast<int>(a[1]);
172         } catch (boost::bad_lexical_cast &) {
173                 throw SubripError (t, "integer minute value", _context);
174         }
175
176         try {
177                 s = lexical_cast<int>(b[0]);
178         } catch (boost::bad_lexical_cast &) {
179                 throw SubripError (t, "integer second value", _context);
180         }
181
182         try {
183                 ms = lexical_cast<int>(b[1]);
184         } catch (boost::bad_lexical_cast &) {
185                 throw SubripError (t, "integer millisecond value", _context);
186         }
187
188         return Time::from_hms (h, m, s, ms);
189 }
190
191 void
192 SubripReader::convert_line (string t, RawSubtitle& p)
193 {
194         enum {
195                 TEXT,
196                 TAG
197         } state = TEXT;
198
199         string tag;
200
201         list<Colour> colours;
202         colours.push_back (Colour (1, 1, 1));
203
204         /* XXX: missing <font> support */
205         /* XXX: nesting of tags e.g. <b>foo<i>bar<b>baz</b>fred</i>jim</b> might
206            not work, I think.
207         */
208
209         for (size_t i = 0; i < t.size(); ++i) {
210                 switch (state) {
211                 case TEXT:
212                         if (t[i] == '<' || t[i] == '{') {
213                                 state = TAG;
214                         } else {
215                                 p.text += t[i];
216                         }
217                         break;
218                 case TAG:
219                         if (t[i] == '>' || t[i] == '}') {
220                                 if (tag == "b") {
221                                         maybe_content (p);
222                                         p.bold = true;
223                                 } else if (tag == "/b") {
224                                         maybe_content (p);
225                                         p.bold = false;
226                                 } else if (tag == "i") {
227                                         maybe_content (p);
228                                         p.italic = true;
229                                 } else if (tag == "/i") {
230                                         maybe_content (p);
231                                         p.italic = false;
232                                 } else if (tag == "u") {
233                                         maybe_content (p);
234                                         p.underline = true;
235                                 } else if (tag == "/u") {
236                                         maybe_content (p);
237                                         p.underline = false;
238                                 } else if (boost::starts_with (tag, "font")) {
239                                         maybe_content (p);
240                                         boost::regex re (".*color=\"?#([[:xdigit:]]+)\"?");
241                                         boost::smatch match;
242                                         if (boost::regex_search (tag, match, re) && string (match[1]).size() == 6) {
243                                                 p.colour = Colour::from_rgb_hex (match[1]);
244                                                 colours.push_back (p.colour);
245                                         } else {
246                                                 re = boost::regex (
247                                                         ".*color=\"rgba\\("
248                                                         "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
249                                                         "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
250                                                         "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
251                                                         "[[:space:]]*([[:digit:]]+)[[:space:]]*"
252                                                         "\\)\""
253                                                         );
254                                                 if (boost::regex_search (tag, match, re) && match.size() == 5) {
255                                                         p.colour.r = raw_convert<int>(string(match[1])) / 255.0;
256                                                         p.colour.g = raw_convert<int>(string(match[2])) / 255.0;
257                                                         p.colour.b = raw_convert<int>(string(match[3])) / 255.0;
258                                                         colours.push_back (p.colour);
259                                                 } else {
260                                                         throw SubripError (tag, "a colour in the format #rrggbb or rgba(rr,gg,bb,aa)", _context);
261                                                 }
262                                         }
263                                 } else if (tag == "/font") {
264                                         maybe_content (p);
265                                         SUB_ASSERT (!colours.empty());
266                                         colours.pop_back ();
267                                         p.colour = colours.back ();
268                                 } else if (tag.size() > 0 && tag[0] == '\\') {
269                                         SSAReader::parse_style (p, tag, 288, 288);
270                                 }
271                                 tag.clear ();
272                                 state = TEXT;
273                         } else {
274                                 tag += tolower (t[i]);
275                         }
276                         break;
277                 }
278         }
279
280         /* Strip Unicode U+202B (right-to-left embedding) as sometimes it is rendered
281            as a missing character.  This may be a hack.
282         */
283         replace_all (p.text, "\xe2\x80\xab", "");
284
285         maybe_content (p);
286 }
287
288 /* Push p into _subs if it has some text, and clear the text out of p */
289 void
290 SubripReader::maybe_content (RawSubtitle& p)
291 {
292         if (!p.text.empty ()) {
293                 _subs.push_back (p);
294                 p.text.clear ();
295         }
296 }