Handle unicode LINE SEPARATOR properly in subrip files. v1.6.37
author Carl Hetherington <cth@carlh.net>
Thu, 10 Nov 2022 10:25:57 +0000 (11:25 +0100)
committer Carl Hetherington <cth@carlh.net>
Thu, 10 Nov 2022 10:25:57 +0000 (11:25 +0100)
src/subrip_reader.cc
test/data/newline.srt [new file with mode: 0644]
test/subrip_reader_test.cc

diff --git a/src/subrip_reader.cc b/src/subrip_reader.cc
index 7c7b5c2c74c5a07068c05818b0c339f7e6259326..f0fe07f0eb997c1e5968dafab243f7b1985f8033 100644 (file)
@@ -28,6 +28,7 @@
 #include "raw_convert.h"
 #include "ssa_reader.h"
 #include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string_regex.hpp>
 #include <boost/lexical_cast.hpp>
 #include <boost/regex.hpp>
 #include <boost/bind.hpp>
@@ -135,8 +136,16 @@ SubripReader::read (function<optional<string> ()> get_line)
                        if (line->empty ()) {
                                state = COUNTER;
                        } else {
-                               convert_line (*line, rs);
-                               rs.vertical_position.line = rs.vertical_position.line.get() + 1;
+                               vector<string> sub_lines;
+                               /* Split up this line on unicode "LINE SEPARATOR".  This feels hacky but also
+                                * the least unpleasant place to do it.
+                                */
+                               boost::algorithm::split_regex(sub_lines, *line, boost::regex("\xe2\x80\xa8"));
+                               for (auto sub_line: sub_lines) {
+                                       convert_line(sub_line, rs);
+                                       rs.vertical_position.line = rs.vertical_position.line.get() + 1;
+                                       rs.text.clear();
+                               }
                        }
                        break;
                }
diff --git a/test/data/newline.srt b/test/data/newline.srt
new file mode 100644 (file)
index 0000000..a7b1660
--- /dev/null
@@ -0,0 +1,8 @@
+1
+00:01:08,234 --> 00:01:10,570
+Du fühlst dich danach besser.
Okay, Kleiner?
+
+2
+00:01:40,642 --> 00:01:42,769
+Sie kann es nicht machen
+wenn du dich bewegst.
diff --git a/test/subrip_reader_test.cc b/test/subrip_reader_test.cc
index 2daf84a52b3e853b4d17a86f282abd501da63ffb..b364d570b60a4569f4a68a2b5f98991e5ed23961 100644 (file)
@@ -656,3 +656,27 @@ BOOST_AUTO_TEST_CASE (subrip_reader_test6)
        r._subs.clear ();
 }
 
+
+BOOST_AUTO_TEST_CASE(subrip_with_unicode_line_separator_test)
+{
+       auto f = fopen ("test/data/newline.srt", "r");
+       BOOST_REQUIRE(f);
+       sub::SubripReader reader(f);
+       fclose(f);
+       auto subs = sub::collect<std::vector<sub::Subtitle>>(reader.subtitles());
+
+       BOOST_REQUIRE_EQUAL(subs.size(), 2U);
+
+       BOOST_REQUIRE_EQUAL(subs[0].lines.size(), 2U);
+       BOOST_REQUIRE_EQUAL(subs[0].lines[0].blocks.size(), 1U);
+       BOOST_CHECK_EQUAL(subs[0].lines[0].blocks[0].text, "Du fühlst dich danach besser.");
+       BOOST_REQUIRE_EQUAL(subs[0].lines[1].blocks.size(), 1U);
+       BOOST_CHECK_EQUAL(subs[0].lines[1].blocks[0].text, "Okay, Kleiner?");
+
+       BOOST_REQUIRE_EQUAL(subs[1].lines.size(), 2U);
+       BOOST_REQUIRE_EQUAL(subs[1].lines[0].blocks.size(), 1U);
+       BOOST_CHECK_EQUAL(subs[1].lines[0].blocks[0].text, "Sie kann es nicht machen");
+       BOOST_REQUIRE_EQUAL(subs[1].lines[1].blocks.size(), 1U);
+       BOOST_CHECK_EQUAL(subs[1].lines[1].blocks[0].text, "wenn du dich bewegst.");
+}
+