Handle angle brackets / tags better in SubRip files.
author Carl Hetherington <cth@carlh.net>
Sun, 1 Aug 2021 00:01:06 +0000 (02:01 +0200)
committer Carl Hetherington <cth@carlh.net>
Sun, 1 Aug 2021 00:01:06 +0000 (02:01 +0200)
Previously we assumed that any < starts a tag, and so parsed text
like << some text >> incorrectly.  Now we search only for the tags
we are interested in and pass anything else through unchanged.

src/subrip_reader.cc
test/subrip_reader_test.cc

index d8972c14b9f774f933aef782634ec8fefe9c9acc..94e1383185208f1e74effec288455974997a46a0 100644 (file)
@@ -190,84 +190,88 @@ SubripReader::convert_time (string t)
 void
 SubripReader::convert_line (string t, RawSubtitle& p)
 {
-       enum {
-               TEXT,
-               TAG
-       } state = TEXT;
-
-       string tag;
-
        vector<Colour> colours;
        colours.push_back (Colour (1, 1, 1));
 
-       for (size_t i = 0; i < t.size(); ++i) {
-               switch (state) {
-               case TEXT:
-                       if (t[i] == '<' || t[i] == '{') {
-                               state = TAG;
-                       } else {
-                               p.text += t[i];
+       auto has_next = [](string line, size_t& index, string s) {
+               boost::to_lower(s);
+               auto next = line.substr(index, s.size());
+               boost::to_lower(next);
+               if (next != s) {
+                       return false;
+               }
+
+               index += s.size();
+               return true;
+       };
+
+       size_t i = 0;
+       while (i < t.size()) {
+               if (has_next(t, i, "<b>") || has_next(t, i, "{b}")) {
+                       maybe_content (p);
+                       p.bold = true;
+               } else if (has_next(t, i, "</b>") || has_next(t, i, "{/b}")) {
+                       maybe_content (p);
+                       p.bold = false;
+               } else if (has_next(t, i, "<i>") || has_next(t, i, "{i}")) {
+                       maybe_content (p);
+                       p.italic = true;
+               } else if (has_next(t, i, "</i>") || has_next(t, i, "{/i}")) {
+                       maybe_content (p);
+                       p.italic = false;
+               } else if (has_next(t, i, "<u>") || has_next(t, i, "{u}")) {
+                       maybe_content (p);
+                       p.underline = true;
+               } else if (has_next(t, i, "</u>") || has_next(t, i, "{/u}")) {
+                       maybe_content (p);
+                       p.underline = false;
+               } else if (has_next(t, i, "<font") || has_next(t, i, "<Font")) {
+                       maybe_content (p);
+                       boost::regex re (".*color=\"?#([[:xdigit:]]+)\"?");
+                       boost::smatch match;
+                       string tag;
+                       while (i < t.size() && t[i] != '>') {
+                               tag += t[i];
+                               ++i;
                        }
-                       break;
-               case TAG:
-                       if (t[i] == '>' || t[i] == '}') {
-                               if (tag == "b") {
-                                       maybe_content (p);
-                                       p.bold = true;
-                               } else if (tag == "/b") {
-                                       maybe_content (p);
-                                       p.bold = false;
-                               } else if (tag == "i") {
-                                       maybe_content (p);
-                                       p.italic = true;
-                               } else if (tag == "/i") {
-                                       maybe_content (p);
-                                       p.italic = false;
-                               } else if (tag == "u") {
-                                       maybe_content (p);
-                                       p.underline = true;
-                               } else if (tag == "/u") {
-                                       maybe_content (p);
-                                       p.underline = false;
-                               } else if (boost::starts_with (tag, "font")) {
-                                       maybe_content (p);
-                                       boost::regex re (".*color=\"?#([[:xdigit:]]+)\"?");
-                                       boost::smatch match;
-                                       if (boost::regex_search (tag, match, re) && string (match[1]).size() == 6) {
-                                               p.colour = Colour::from_rgb_hex (match[1]);
-                                               colours.push_back (p.colour);
-                                       } else {
-                                               re = boost::regex (
-                                                       ".*color=\"rgba\\("
-                                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
-                                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
-                                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
-                                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*"
-                                                       "\\)\""
-                                                       );
-                                               if (boost::regex_search (tag, match, re) && match.size() == 5) {
-                                                       p.colour.r = raw_convert<int>(string(match[1])) / 255.0;
-                                                       p.colour.g = raw_convert<int>(string(match[2])) / 255.0;
-                                                       p.colour.b = raw_convert<int>(string(match[3])) / 255.0;
-                                                       colours.push_back (p.colour);
-                                               } else {
-                                                       throw SubripError (tag, "a colour in the format #rrggbb or rgba(rr,gg,bb,aa)", _context);
-                                               }
-                                       }
-                               } else if (tag == "/font") {
-                                       maybe_content (p);
-                                       SUB_ASSERT (!colours.empty());
-                                       colours.pop_back ();
-                                       p.colour = colours.back ();
-                               } else if (tag.size() > 0 && tag[0] == '\\') {
-                                       SSAReader::parse_style (p, tag, 288, 288);
-                               }
-                               tag.clear ();
-                               state = TEXT;
+                       ++i;
+                       if (boost::regex_search (tag, match, re) && string (match[1]).size() == 6) {
+                               p.colour = Colour::from_rgb_hex (match[1]);
+                               colours.push_back (p.colour);
                        } else {
-                               tag += tolower (t[i]);
+                               re = boost::regex (
+                                       ".*color=\"rgba\\("
+                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
+                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
+                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*,"
+                                       "[[:space:]]*([[:digit:]]+)[[:space:]]*"
+                                       "\\)\""
+                                       );
+                               if (boost::regex_search (tag, match, re) && match.size() == 5) {
+                                       p.colour.r = raw_convert<int>(string(match[1])) / 255.0;
+                                       p.colour.g = raw_convert<int>(string(match[2])) / 255.0;
+                                       p.colour.b = raw_convert<int>(string(match[3])) / 255.0;
+                                       colours.push_back (p.colour);
+                               } else {
+                                       throw SubripError (tag, "a colour in the format #rrggbb or rgba(rr,gg,bb,aa)", _context);
+                               }
                        }
-                       break;
+               } else if (has_next(t, i, "</font>")) {
+                       maybe_content (p);
+                       SUB_ASSERT (!colours.empty());
+                       colours.pop_back ();
+                       p.colour = colours.back ();
+               } else if (has_next(t, i, "{\\")) {
+                       string ssa = "\\";
+                       while (i < t.size() && t[i] != '}') {
+                               ssa += t[i];
+                               ++i;
+                       }
+                       ++i;
+                       SSAReader::parse_style (p, ssa, 288, 288);
+               } else {
+                       p.text += t[i];
+                       ++i;
                }
        }
 
index 818b6ad3ecd01f2d6335f02eabc8637e1e463517..2daf84a52b3e853b4d17a86f282abd501da63ffb 100644 (file)
@@ -307,6 +307,24 @@ BOOST_AUTO_TEST_CASE (subrip_reader_convert_line_test)
        ++i;
        r._subs.clear ();
 
+       rs = sub::RawSubtitle();
+       r.convert_line ("<B>This is <I>nesting</I> of subtitles</B>", rs);
+       BOOST_CHECK_EQUAL (r._subs.size(), 3);
+       i = r._subs.begin();
+       BOOST_CHECK_EQUAL (i->text, "This is ");
+       BOOST_CHECK_EQUAL (i->bold, true);
+       BOOST_CHECK_EQUAL (i->italic, false);
+       ++i;
+       BOOST_CHECK_EQUAL (i->text, "nesting");
+       BOOST_CHECK_EQUAL (i->bold, true);
+       BOOST_CHECK_EQUAL (i->italic, true);
+       ++i;
+       BOOST_CHECK_EQUAL (i->text, " of subtitles");
+       BOOST_CHECK_EQUAL (i->bold, true);
+       BOOST_CHECK_EQUAL (i->italic, false);
+       ++i;
+       r._subs.clear ();
+
        rs = sub::RawSubtitle();
        r.convert_line ("<font color=\"#ff00ff\">simple color</font>", rs);
        BOOST_CHECK_EQUAL (r._subs.size(), 1);
@@ -377,6 +395,13 @@ BOOST_AUTO_TEST_CASE (subrip_reader_convert_line_test)
        BOOST_CHECK (fabs (i->colour.g) < 0.01);
        BOOST_CHECK_CLOSE (i->colour.b, 1, 0.1);
        r._subs.clear ();
+
+       rs = sub::RawSubtitle();
+       r.convert_line ("<< angle brackets but no HTML >>", rs);
+       BOOST_CHECK_EQUAL (r._subs.size(), 1);
+       i = r._subs.begin ();
+       BOOST_CHECK_EQUAL (i->text, "<< angle brackets but no HTML >>");
+       r._subs.clear();
 }
 
 /** Test SubripReader::convert_time */