From: Carl Hetherington Date: Tue, 22 Sep 2015 15:15:08 +0000 (+0100) Subject: Use uchardet to guess encoding of subtitle files and reject non-UTF-8. X-Git-Tag: v2.3.6~28 X-Git-Url: https://main.carlh.net/gitweb/?p=dcpomatic.git;a=commitdiff_plain;h=bdbe925a467f9b7149322ad8d1c090d4c1e6d5c3 Use uchardet to guess encoding of subtitle files and reject non-UTF-8. --- diff --git a/src/lib/exceptions.h b/src/lib/exceptions.h index 7240611ee..6939f81a3 100644 --- a/src/lib/exceptions.h +++ b/src/lib/exceptions.h @@ -263,4 +263,12 @@ public: ProgrammingError (std::string file, int line); }; +class TextEncodingError : public StringError +{ +public: + TextEncodingError (std::string s) + : StringError (s) + {} +}; + #endif diff --git a/src/lib/subrip.cc b/src/lib/subrip.cc index f19867952..d4adee428 100644 --- a/src/lib/subrip.cc +++ b/src/lib/subrip.cc @@ -23,10 +23,14 @@ #include "subrip_content.h" #include #include +#include +#include #include "i18n.h" using std::vector; +using std::cout; +using std::string; using boost::shared_ptr; SubRip::SubRip (shared_ptr content) @@ -36,6 +40,25 @@ SubRip::SubRip (shared_ptr content) throw OpenFileError (content->path (0)); } + /* Guess the encoding */ + uchardet_t det = uchardet_new (); + char buffer[1024]; + while (!feof (f)) { + int const n = fread (buffer, 1, sizeof (buffer), f); + if (uchardet_handle_data (det, buffer, n)) { + break; + } + } + + uchardet_data_end (det); + string charset = uchardet_get_charset (det); + uchardet_delete (det); + + if (charset != "UTF-8") { + throw TextEncodingError (_("unrecognised character set; please use files encoded in UTF-8")); + } + + rewind (f); sub::SubripReader reader (f); _subtitles = sub::collect > (reader.subtitles ()); } diff --git a/src/tools/wscript b/src/tools/wscript index 33a631e6e..b01eee7ca 100644 --- a/src/tools/wscript +++ b/src/tools/wscript @@ -29,6 +29,7 @@ def configure(conf): def build(bld): uselib = 'BOOST_THREAD BOOST_DATETIME OPENJPEG DCP XMLSEC CXML XMLPP AVFORMAT AVFILTER AVCODEC ' uselib += 'AVUTIL SWSCALE POSTPROC CURL BOOST_FILESYSTEM SSH ZIP CAIROMM FONTCONFIG PANGOMM SUB MAGICK SNDFILE SAMPLERATE BOOST_REGEX ' + uselib += 'UCHARDET ' if bld.env.TARGET_WINDOWS: uselib += 'WINSOCK2' diff --git a/test/wscript b/test/wscript index 1a1038e8d..a92e344eb 100644 --- a/test/wscript +++ b/test/wscript @@ -31,7 +31,7 @@ def build(bld): obj = bld(features='cxx cxxprogram') obj.name = 'unit-tests' obj.uselib = 'BOOST_TEST BOOST_THREAD BOOST_FILESYSTEM BOOST_DATETIME SNDFILE SAMPLERATE DCP OPENJPEG FONTCONFIG CAIROMM PANGOMM XMLPP ' - obj.uselib += 'AVFORMAT AVFILTER AVCODEC AVUTIL SWSCALE POSTPROC CXML MAGICK SUB GLIB CURL SSH XMLSEC BOOST_REGEX ' + obj.uselib += 'AVFORMAT AVFILTER AVCODEC AVUTIL SWSCALE POSTPROC CXML MAGICK SUB GLIB CURL SSH XMLSEC BOOST_REGEX UCHARDET ' if bld.env.TARGET_WINDOWS: obj.uselib += 'WINSOCK2' obj.use = 'libdcpomatic2' diff --git a/wscript b/wscript index 3e84dcac5..150e9cb16 100644 --- a/wscript +++ b/wscript @@ -159,6 +159,8 @@ def configure(conf): else: conf.check_cfg(package='libcurl', args='--cflags --libs', uselib_store='CURL', mandatory=True) + # uchardet + conf.check_cfg(package='uchardet', args='--cflags --libs', uselib_store='UCHARDET', mandatory=True) # libsndfile conf.check_cfg(package='sndfile', args='--cflags --libs', uselib_store='SNDFILE', mandatory=True)