Add primitive WebVTT reader. v1.6.38
author Carl Hetherington <cth@carlh.net>
Sun, 13 Nov 2022 18:46:35 +0000 (19:46 +0100)
committer Carl Hetherington <cth@carlh.net>
Sun, 13 Nov 2022 18:46:35 +0000 (19:46 +0100)
src/web_vtt_reader.cc [new file with mode: 0644]
src/web_vtt_reader.h [new file with mode: 0644]
src/wscript
test/data/test.vtt [new file with mode: 0644]
test/webvtt_reader_test.cc [new file with mode: 0644]
test/wscript

diff --git a/src/web_vtt_reader.cc b/src/web_vtt_reader.cc
new file mode 100644 (file)
index 0000000..2781654
--- /dev/null
@@ -0,0 +1,157 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+
+#include "exceptions.h"
+#include "subrip_reader.h"
+#include "util.h"
+#include "web_vtt_reader.h"
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string_regex.hpp>
+#include <boost/bind.hpp>
+#include <boost/regex.hpp>
+#include <iostream>
+#include <vector>
+
+
+using std::function;
+using std::string;
+using std::vector;
+using boost::optional;
+using namespace sub;
+
+
+/** Construct a reader and parse WebVTT data straight away, taking lines
+ *  from a C stream via get_line_file().
+ *
+ *  @param file Stream to take lines from.
+ */
+WebVTTReader::WebVTTReader(FILE* file)
+{
+       auto next_line = [file]() {
+               return get_line_file(file);
+       };
+
+       read(next_line);
+}
+
+
+/** Construct a reader and parse WebVTT data straight away, taking lines
+ *  from a string via get_line_string().
+ *
+ *  @param subs WebVTT data.  This is our own copy, and get_line_string() is
+ *  given a pointer to it; it only needs to live for the duration of read(),
+ *  which completes before this constructor returns.
+ */
+WebVTTReader::WebVTTReader(string subs)
+{
+       auto next_line = [&subs]() {
+               return get_line_string(&subs);
+       };
+
+       read(next_line);
+}
+
+
+/** Parse WebVTT data one line at a time, appending a RawSubtitle to _subs for
+ *  each line of subtitle text that is found.
+ *
+ *  @param get_line Function which returns the next line of input, or an empty
+ *  optional when there is no more input.
+ *
+ *  @throws WebVTTError if the first line does not start with WEBVTT, if a cue
+ *  timing line cannot be parsed, or if something other than a note, a timing
+ *  line or a blank line is found between cues.
+ */
+void
+WebVTTReader::read(std::function<optional<string> ()> get_line)
+{
+       enum class State {
+               /* expecting WEBVTT */
+               HEADER,
+               /* awaiting a NOTE or a subtitle timing line */
+               DATA,
+               /* reading the text of a subtitle */
+               SUBTITLE,
+               /* reading a note */
+               NOTE
+       } state = State::HEADER;
+
+       RawSubtitle rs;
+
+       rs.vertical_position.line = 0;
+       rs.vertical_position.reference = TOP_OF_SUBTITLE;
+
+       /* UTF-8 encoding of unicode "LINE SEPARATOR" (U+2028); built once here
+        * rather than again for every line of subtitle text.
+        */
+       boost::regex const line_separator("\xe2\x80\xa8");
+
+       while (true) {
+               auto line = get_line();
+               if (!line) {
+                       break;
+               }
+
+               trim_right_if(*line, boost::is_any_of("\n\r"));
+               remove_unicode_bom(line);
+
+               /* Keep some history in case there is an error to report */
+               _context.push_back(*line);
+               if (_context.size() > 5) {
+                       _context.pop_front();
+               }
+
+               switch (state) {
+               case State::HEADER:
+                       if (!boost::starts_with(*line, "WEBVTT")) {
+                               throw WebVTTError("No WEBVTT header found");
+                       }
+                       state = State::DATA;
+                       break;
+               case State::DATA:
+                       if (boost::starts_with(*line, "NOTE")) {
+                               state = State::NOTE;
+                       } else if (line->find("-->") != string::npos) {
+                               /* Further trim this line, removing spaces and tabs from the end */
+                               trim_right_if(*line, boost::is_any_of(" \t"));
+
+                               vector<string> parts;
+                               boost::algorithm::split(parts, *line, boost::algorithm::is_any_of(" \t"), boost::token_compress_on);
+
+                               /* A timing line is <from> --> <to>, optionally followed by any number
+                                * of WebVTT cue settings (e.g. align:start).  We accept the settings
+                                * but ignore them.
+                                */
+                               if (parts.size() < 3 || parts[1] != "-->") {
+                                       /* Pull a couple more lines into the context so that the error
+                                        * report also shows what followed the bad line.
+                                        */
+                                       for (int i = 0; i < 2; ++i) {
+                                               auto ex = get_line();
+                                               if (ex) {
+                                                       _context.push_back(*ex);
+                                               }
+                                       }
+                                       throw WebVTTError(*line, "a time line", _context);
+                               }
+
+                               string expected;
+                               auto from = SubripReader::convert_time(parts[0], &expected);
+                               if (!from) {
+                                       throw WebVTTError(parts[0], expected, _context);
+                               }
+                               rs.from = *from;
+
+                               auto to = SubripReader::convert_time(parts[2], &expected);
+                               if (!to) {
+                                       throw WebVTTError(parts[2], expected, _context);
+                               }
+                               rs.to = *to;
+
+                               /* Each cue starts again from its first line */
+                               rs.vertical_position.line = 0;
+                               state = State::SUBTITLE;
+                       } else if (!line->empty()) {
+                               throw WebVTTError(*line, "a note or time", _context);
+                       }
+                       break;
+               case State::SUBTITLE:
+                       if (line->empty()) {
+                               /* A blank line ends the cue */
+                               state = State::DATA;
+                       } else {
+                               /* Split up this line on unicode "LINE SEPARATOR".  This feels hacky but also
+                                * the least unpleasant place to do it.
+                                */
+                               vector<string> sub_lines;
+                               boost::algorithm::split_regex(sub_lines, *line, line_separator);
+                               for (auto const& sub_line: sub_lines) {
+                                       rs.text = sub_line;
+                                       _subs.push_back(rs);
+                                       rs.vertical_position.line = rs.vertical_position.line.get() + 1;
+                               }
+                       }
+                       break;
+               case State::NOTE:
+                       /* Notes are skipped; a blank line ends them */
+                       if (line->empty()) {
+                               state = State::DATA;
+                       }
+                       break;
+               }
+
+       }
+}
+
diff --git a/src/web_vtt_reader.h b/src/web_vtt_reader.h
new file mode 100644 (file)
index 0000000..495e2bc
--- /dev/null
@@ -0,0 +1,49 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+
+#ifndef LIBSUB_WEB_VTT_READER_H
+#define LIBSUB_WEB_VTT_READER_H
+
+
+#include "reader.h"
+#include <cstdio>
+#include <string>
+
+
+namespace sub {
+
+
+/** @class WebVTTReader
+ *  @brief A primitive reader for WebVTT subtitles.
+ *
+ *  Both constructors parse their input immediately.
+ */
+class WebVTTReader : public Reader
+{
+public:
+       /** Read WebVTT data from a C stream.
+        *  @param file Stream to read from.
+        */
+       WebVTTReader(FILE* file);
+       /** Read WebVTT data from a string.
+        *  @param subs String containing the WebVTT data.
+        */
+       WebVTTReader(std::string subs);
+
+private:
+       /** Parse WebVTT data, obtaining each line by calling get_line until it
+        *  returns an empty optional.
+        */
+       void read(std::function<boost::optional<std::string> ()> get_line);
+
+       /** The most recently read lines, kept so that they can be passed to
+        *  WebVTTError when a parse error is reported.
+        */
+       std::list<std::string> _context;
+};
+
+
+}
+
+#endif
+
index 491111744d0e4c28d111afd7fe3fbf219d18ee15..ff029a073659e27941474b0030ecc2845bce233d 100644 (file)
@@ -38,6 +38,7 @@ def build(bld):
                  util.cc
                  vertical_reference.cc
                  vertical_position.cc
+                 web_vtt_reader.cc
                  """
 
     headers = """
@@ -63,6 +64,7 @@ def build(bld):
               subtitle.h
               vertical_position.h
               vertical_reference.h
+              web_vtt_reader.h
               """
 
     bld.install_files('${PREFIX}/include/libsub%s/sub' % bld.env.API_VERSION, headers)
diff --git a/test/data/test.vtt b/test/data/test.vtt
new file mode 100644 (file)
index 0000000..461c8a2
--- /dev/null
@@ -0,0 +1,11 @@
+WEBVTT - you can put something here
+
+NOTE You can have notes
+That span multiple lines
+
+00:00:41,090 --> 00:00:42,210
+This is a subtitle
+and that's a line break
+
+00:01:01,010 --> 00:01:02,100
+This is some stuff.
diff --git a/test/webvtt_reader_test.cc b/test/webvtt_reader_test.cc
new file mode 100644 (file)
index 0000000..024f89f
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+    Copyright (C) 2022 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include "web_vtt_reader.h"
+#include "subtitle.h"
+#include "test.h"
+#include "exceptions.h"
+#include "collect.h"
+#include <boost/test/unit_test.hpp>
+#include <boost/filesystem.hpp>
+#include <cmath>
+#include <iostream>
+#include <cstdio>
+
+
+using std::cerr;
+using std::vector;
+using std::fabs;
+
+
+/* Test reading of a VTT file */
+/* Read test/data/test.vtt and check the times, text, (lack of) font details and
+ * vertical positions of the two subtitles that it contains.
+ */
+BOOST_AUTO_TEST_CASE(vtt_reader_test)
+{
+       auto f = fopen("test/data/test.vtt", "r");
+       /* Stop here if the test data is missing rather than handing a null FILE* to the reader */
+       BOOST_REQUIRE(f);
+       sub::WebVTTReader reader(f);
+       fclose(f);
+       auto subs = sub::collect<std::vector<sub::Subtitle>>(reader.subtitles());
+
+       auto i = subs.begin();
+
+
+       /* First subtitle */
+
+       BOOST_REQUIRE(i != subs.end());
+       BOOST_CHECK_EQUAL(i->from, sub::Time::from_hms(0, 0, 41, 90));
+       BOOST_CHECK_EQUAL(i->to, sub::Time::from_hms(0, 0, 42, 210));
+
+       auto j = i->lines.begin();
+       /* REQUIRE, not CHECK: j is dereferenced below */
+       BOOST_REQUIRE(j != i->lines.end());
+       BOOST_REQUIRE_EQUAL(j->blocks.size(), 1U);
+       auto b = j->blocks[0];
+       BOOST_CHECK_EQUAL(b.text, "This is a subtitle");
+       /* No font is specified by WebVTT, so none should be seen here */
+       BOOST_CHECK(!b.font);
+       BOOST_CHECK(!b.font_size.specified());
+       BOOST_CHECK_EQUAL(b.bold, false);
+       BOOST_CHECK_EQUAL(b.italic, false);
+       BOOST_CHECK_EQUAL(b.underline, false);
+       BOOST_REQUIRE(j->vertical_position.line);
+       BOOST_CHECK_EQUAL(j->vertical_position.line.get(), 0);
+       BOOST_CHECK_EQUAL(j->vertical_position.reference.get(), sub::TOP_OF_SUBTITLE);
+       ++j;
+
+       /* REQUIRE, not CHECK: j is dereferenced below */
+       BOOST_REQUIRE(j != i->lines.end());
+       BOOST_REQUIRE_EQUAL(j->blocks.size(), 1U);
+       b = j->blocks[0];
+       BOOST_CHECK_EQUAL(b.text, "and that's a line break");
+       /* No font is specified by WebVTT, so none should be seen here */
+       BOOST_CHECK(!b.font);
+       BOOST_CHECK(!b.font_size.specified());
+       BOOST_CHECK_EQUAL(b.bold, false);
+       BOOST_CHECK_EQUAL(b.italic, false);
+       BOOST_CHECK_EQUAL(b.underline, false);
+       BOOST_REQUIRE(j->vertical_position.line);
+       BOOST_CHECK_EQUAL(j->vertical_position.line.get(), 1);
+       BOOST_CHECK_EQUAL(j->vertical_position.reference.get(), sub::TOP_OF_SUBTITLE);
+       ++i;
+
+
+       /* Second subtitle */
+
+       BOOST_REQUIRE(i != subs.end());
+       BOOST_CHECK_EQUAL(i->from, sub::Time::from_hms(0, 1, 1, 10));
+       BOOST_CHECK_EQUAL(i->to, sub::Time::from_hms(0, 1, 2, 100));
+
+       /* REQUIRE, not CHECK: the first line is indexed below */
+       BOOST_REQUIRE_EQUAL(i->lines.size(), 1U);
+       sub::Line l = i->lines[0];
+       BOOST_CHECK_EQUAL(l.vertical_position.line.get(), 0);
+       BOOST_CHECK_EQUAL(l.vertical_position.reference.get(), sub::TOP_OF_SUBTITLE);
+
+       BOOST_REQUIRE_EQUAL(l.blocks.size(), 1U);
+       b = l.blocks[0];
+       BOOST_CHECK_EQUAL(b.text, "This is some stuff.");
+       /* No font is specified by WebVTT, so none should be seen here */
+       BOOST_CHECK(!b.font);
+       BOOST_CHECK(!b.font_size.specified());
+       BOOST_CHECK_EQUAL(b.bold, false);
+       BOOST_CHECK_EQUAL(b.italic, false);
+       BOOST_CHECK_EQUAL(b.underline, false);
+}
+
index b7d91cb0dd7a61aacbffa45350d0bd8bbcdf7823..4653bd0e6bd31fef6b332229dcf499829c63aabd 100644 (file)
@@ -31,6 +31,7 @@ def build(bld):
                  time_test.cc
                  test.cc
                  vertical_position_test.cc
+                 webvtt_reader_test.cc
                  """
     obj.target = 'tests'
     obj.install_path = ''