X-Git-Url: https://main.carlh.net/gitweb/?a=blobdiff_plain;f=src%2Fsubrip_reader.cc;h=8ba7c7ddc51f08f2434541d3d07d9fc6fffd129b;hb=e3d8790ae7c9f8dbbcc9cd8a1fa5c0fede26b872;hp=134ca3e7ccd08c380df38cc6fe4418e0fbcc9547;hpb=4d406c620b0211a5e27c19187d963241120f8838;p=libsub.git diff --git a/src/subrip_reader.cc b/src/subrip_reader.cc index 134ca3e..8ba7c7d 100644 --- a/src/subrip_reader.cc +++ b/src/subrip_reader.cc @@ -1,5 +1,5 @@ /* - Copyright (C) 2014 Carl Hetherington + Copyright (C) 2014-2020 Carl Hetherington This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -17,24 +17,49 @@ */ +/** @file src/subrip_reader.cc + * @brief SubripReader class. + */ + #include "subrip_reader.h" #include "exceptions.h" +#include "util.h" +#include "sub_assert.h" +#include "raw_convert.h" +#include "ssa_reader.h" #include #include #include +#include #include #include +#include using std::string; using std::vector; -using std::list; using std::cout; using std::hex; using boost::lexical_cast; using boost::to_upper; +using boost::optional; +using boost::function; +using boost::algorithm::replace_all; using namespace sub; +/** @param s Subtitle string encoded in UTF-8 */ +SubripReader::SubripReader (string s) +{ + this->read (boost::bind(&get_line_string, &s)); +} + +/** @param f Subtitle file encoded in UTF-8 */ SubripReader::SubripReader (FILE* f) +{ + this->read (boost::bind (&get_line_file, f)); +} + +void +SubripReader::read (function ()> get_line) { enum { COUNTER, @@ -42,55 +67,64 @@ SubripReader::SubripReader (FILE* f) CONTENT } state = COUNTER; - char buffer[256]; - - Time from; - Time to; + RawSubtitle rs; - string line; - int line_number = 0; + rs.vertical_position.line = 0; + rs.vertical_position.reference = TOP_OF_SUBTITLE; - while (!feof (f)) { - char* r = fgets (buffer, sizeof (buffer), f); - if (r == 0 || feof (f)) { + while (true) { + auto line = get_line (); + if (!line) { break; } - line = string (buffer); - trim_right_if (line, boost::is_any_of ("\n\r")); + trim_right_if (*line, boost::is_any_of ("\n\r")); + remove_unicode_bom (line); - if ( - line.length() >= 3 && - static_cast (line[0]) == 0xef && - static_cast (line[1]) == 0xbb && - static_cast (line[2]) == 0xbf - ) { - - /* Skip Unicode byte order mark */ - line = line.substr (3); + /* Keep some history in case there is an error to report */ + _context.push_back (*line); + if (_context.size() > 5) { + _context.pop_front (); } switch (state) { case COUNTER: { - if (line.empty ()) { + if (line->empty ()) { /* a blank line at the start is ok */ break; } state = METADATA; + + /* Reset stuff that should not persist across separate subtitles */ + rs.bold = false; + rs.italic = false; + rs.underline = false; + rs.vertical_position.line = 0; + rs.vertical_position.reference = TOP_OF_SUBTITLE; } break; case METADATA: { vector p; - boost::algorithm::split (p, line, boost::algorithm::is_any_of (" ")); + + /* Further trim this line, removing spaces from the end */ + trim_right_if (*line, boost::is_any_of (" ")); + + boost::algorithm::split (p, *line, boost::algorithm::is_any_of (" "), boost::token_compress_on); if (p.size() != 3 && p.size() != 7) { - throw SubripError (line, "a time/position line"); + for (int i = 0; i < 2; ++i) { + optional ex = get_line (); + if (ex) { + _context.push_back (*ex); + } + } + throw SubripError (*line, "a time/position line", _context); } - from = convert_time (p[0]); - to = convert_time (p[2]); + rs.from = convert_time (p[0]); + rs.to = convert_time (p[2]); /* XXX: should not ignore coordinate specifications */ @@ -98,12 +132,11 @@ SubripReader::SubripReader (FILE* f) break; } case CONTENT: - if (line.empty ()) { + if (line->empty ()) { state = COUNTER; - line_number = 0; } else { - convert_line (line, line_number, from, to); - line_number++; + convert_line (*line, rs); + rs.vertical_position.line = rs.vertical_position.line.get() + 1; } break; } @@ -116,22 +149,46 @@ SubripReader::convert_time (string t) vector a; boost::algorithm::split (a, t, boost::is_any_of (":")); if (a.size() != 3) { - throw SubripError (t, "time in the format h:m:s,ms"); + throw SubripError (t, "time in the format h:m:s,ms", _context); } vector b; boost::algorithm::split (b, a[2], boost::is_any_of (",")); + if (b.size() != 2) { + throw SubripError (t, "time in the format h:m:s,ms", _context); + } + + int h, m, s, ms; + + try { + h = lexical_cast(a[0]); + } catch (boost::bad_lexical_cast &) { + throw SubripError (t, "integer hour value", _context); + } + + try { + m = lexical_cast(a[1]); + } catch (boost::bad_lexical_cast &) { + throw SubripError (t, "integer minute value", _context); + } - return Time::from_hms ( - lexical_cast (a[0]), - lexical_cast (a[1]), - lexical_cast (b[0]), - lexical_cast (b[1]) - ); + try { + s = lexical_cast(b[0]); + } catch (boost::bad_lexical_cast &) { + throw SubripError (t, "integer second value", _context); + } + + try { + ms = lexical_cast(b[1]); + } catch (boost::bad_lexical_cast &) { + throw SubripError (t, "integer millisecond value", _context); + } + + return Time::from_hms (h, m, s, ms); } void -SubripReader::convert_line (string t, int line_number, Time from, Time to) +SubripReader::convert_line (string t, RawSubtitle& p) { enum { TEXT, @@ -140,17 +197,7 @@ SubripReader::convert_line (string t, int line_number, Time from, Time to) string tag; - RawSubtitle p; - p.font = "Arial"; - p.font_size.set_points (48); - p.from = from; - p.to = to; - p.vertical_position.line = line_number; - /* XXX: arbitrary */ - p.vertical_position.lines = 32; - p.vertical_position.reference = TOP_OF_SUBTITLE; - - list colours; + vector colours; colours.push_back (Colour (1, 1, 1)); /* XXX: missing support */ @@ -189,28 +236,55 @@ SubripReader::convert_line (string t, int line_number, Time from, Time to) p.underline = false; } else if (boost::starts_with (tag, "font")) { maybe_content (p); - boost::regex re (".*color=\"#([0123456789abcdef]+)\""); + boost::regex re (".*color=\"?#([[:xdigit:]]+)\"?"); boost::smatch match; if (boost::regex_search (tag, match, re) && string (match[1]).size() == 6) { p.colour = Colour::from_rgb_hex (match[1]); colours.push_back (p.colour); + } else { + re = boost::regex ( + ".*color=\"rgba\\(" + "[[:space:]]*([[:digit:]]+)[[:space:]]*," + "[[:space:]]*([[:digit:]]+)[[:space:]]*," + "[[:space:]]*([[:digit:]]+)[[:space:]]*," + "[[:space:]]*([[:digit:]]+)[[:space:]]*" + "\\)\"" + ); + if (boost::regex_search (tag, match, re) && match.size() == 5) { + p.colour.r = raw_convert(string(match[1])) / 255.0; + p.colour.g = raw_convert(string(match[2])) / 255.0; + p.colour.b = raw_convert(string(match[3])) / 255.0; + colours.push_back (p.colour); + } else { + throw SubripError (tag, "a colour in the format #rrggbb or rgba(rr,gg,bb,aa)", _context); + } } } else if (tag == "/font") { + maybe_content (p); + SUB_ASSERT (!colours.empty()); colours.pop_back (); p.colour = colours.back (); + } else if (tag.size() > 0 && tag[0] == '\\') { + SSAReader::parse_style (p, tag, 288, 288); } tag.clear (); state = TEXT; } else { - tag += t[i]; + tag += tolower (t[i]); } break; } } + /* Strip Unicode U+202B (right-to-left embedding) as sometimes it is rendered + as a missing character. This may be a hack. + */ + replace_all (p.text, "\xe2\x80\xab", ""); + maybe_content (p); } +/* Push p into _subs if it has some text, and clear the text out of p */ void SubripReader::maybe_content (RawSubtitle& p) {