More STL binary reading stuff.
author Carl Hetherington <cth@carlh.net>
Wed, 28 May 2014 10:50:15 +0000 (11:50 +0100)
committer Carl Hetherington <cth@carlh.net>
Wed, 28 May 2014 10:50:15 +0000 (11:50 +0100)
16 files changed:
.gitignore
run/tests
src/iso6937.cc [new file with mode: 0644]
src/iso6937.h [new file with mode: 0644]
src/iso6937.py [new file with mode: 0644]
src/iso6937_tables.cc [new file with mode: 0644]
src/iso6937_tables.h [new file with mode: 0644]
src/stl_binary_reader.cc
src/stl_binary_reader.h
src/subtitle.h
src/wscript
test/iso6937_test.cc [new file with mode: 0644]
test/test.cc
test/wscript
tools/dumpsubs.cc
wscript

index bdeda98fa14e4dea69e325228275a6b0df8586bd..f51d5cde47125ac70616ef31efe9a54fae28ddb7 100644 (file)
@@ -2,4 +2,5 @@
 build
 .waf-*
 .lock-waf*
-src/version.cc
\ No newline at end of file
+src/version.cc
+UnicodeData.txt
index d32bc9adf3ec78dbb1101fe00e25b3bd6564f796..6858a3437fe1fda14e4b6b6e029212b70ba783a4 100755 (executable)
--- a/run/tests
+++ b/run/tests
@@ -1,4 +1,4 @@
 #!/bin/bash -e
 
 export LD_LIBRARY_PATH=build/src
-build/test/tests ../libsub-test-private
+build/test/tests ../libsub-test-private $*
diff --git a/src/iso6937.cc b/src/iso6937.cc
new file mode 100644 (file)
index 0000000..d6c1970
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <string>
+#include <boost/optional.hpp>
+#include "iso6937_tables.h"
+#include "iso6937.h"
+
+using std::string;
+using std::cout;
+using std::wstring;
+using namespace sub;
+
+/** Append to o the value that table holds for the byte u.
+ *  A byte with no entry in the table is dropped.  The table is only
+ *  searched, never modified (std::map::operator[] would insert a
+ *  zero-valued entry for every byte that is missing).
+ */
+static void
+append_mapped (wstring& o, std::map<char, wchar_t> const & table, unsigned char u)
+{
+       std::map<char, wchar_t>::const_iterator i = table.find (static_cast<char> (u));
+       if (i != table.end ()) {
+               o += i->second;
+       }
+}
+
+/** Convert a null-terminated ISO 6937 string to a wide string.
+ *
+ *  Bytes 0xC1 to 0xCF are non-spacing diacriticals which modify the byte
+ *  that follows them; the pair is looked up in the table for that
+ *  diacritical.  All other bytes are looked up in iso6937::main.
+ *
+ *  Bytes (or diacritical / letter pairs) which have no table entry produce
+ *  no output.  0xC9 and 0xCC have no table, so the byte following them is
+ *  dropped.  A diacritical at the very end of the string is discarded.
+ *
+ *  @param s Null-terminated ISO 6937 string.
+ *  @return Converted string.
+ */
+wstring
+sub::iso6937_to_utf16 (char const * s)
+{
+       /* The tables are filled the first time that they are needed */
+       if (iso6937::grave.empty ()) {
+               make_iso6937_tables ();
+       }
+
+       wstring o;
+
+       /* Diacritical byte which has been seen but not yet applied to a letter */
+       boost::optional<unsigned char> diacritical;
+
+       for (; *s != '\0'; ++s) {
+               unsigned char const u = static_cast<unsigned char> (*s);
+
+               if (u >= 0xc1 && u <= 0xcf) {
+                       /* A second diacritical in a row replaces the first */
+                       diacritical = u;
+                       continue;
+               }
+
+               if (!diacritical) {
+                       append_mapped (o, iso6937::main, u);
+                       continue;
+               }
+
+               switch (diacritical.get ()) {
+               case 0xC1:
+                       append_mapped (o, iso6937::grave, u);
+                       break;
+               case 0xC2:
+                       append_mapped (o, iso6937::acute, u);
+                       break;
+               case 0xC3:
+                       append_mapped (o, iso6937::circumflex, u);
+                       break;
+               case 0xC4:
+                       append_mapped (o, iso6937::tilde, u);
+                       break;
+               case 0xC5:
+                       append_mapped (o, iso6937::macron, u);
+                       break;
+               case 0xC6:
+                       append_mapped (o, iso6937::breve, u);
+                       break;
+               case 0xC7:
+                       append_mapped (o, iso6937::dot, u);
+                       break;
+               case 0xC8:
+                       append_mapped (o, iso6937::diaeresis, u);
+                       break;
+               case 0xCA:
+                       append_mapped (o, iso6937::ring, u);
+                       break;
+               case 0xCB:
+                       append_mapped (o, iso6937::cedilla, u);
+                       break;
+               case 0xCD:
+                       append_mapped (o, iso6937::double_acute, u);
+                       break;
+               case 0xCE:
+                       append_mapped (o, iso6937::ogonek, u);
+                       break;
+               case 0xCF:
+                       append_mapped (o, iso6937::caron, u);
+                       break;
+               }
+
+               diacritical.reset ();
+       }
+
+       return o;
+}
diff --git a/src/iso6937.h b/src/iso6937.h
new file mode 100644 (file)
index 0000000..700602d
--- /dev/null
@@ -0,0 +1,24 @@
+/*
+    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+namespace sub {
+
+       /** Convert the null-terminated ISO 6937 string s to a wide string.
+        *  NOTE: this header uses std::wstring but does not include <string>
+        *  itself.
+        */
+       extern std::wstring iso6937_to_utf16 (char const * s);
+
+}
diff --git a/src/iso6937.py b/src/iso6937.py
new file mode 100644 (file)
index 0000000..4719b07
--- /dev/null
@@ -0,0 +1,227 @@
+import urllib2
+import sys
+
+# URL of the Unicode character database in which code points are looked up by name
+DATA = 'http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt'
+# Path stem of the generated files; '.cc' and '.h' are appended to it below
+OUTPUT = 'src/iso6937_tables'
+
+# Fetch the whole database into memory once; find_unicode() searches this text
+data = urllib2.urlopen(DATA).read()
+# Alternative which reads a local copy of the database instead of downloading it
+# data = open('UnicodeData.txt').read()
+output_c = open(OUTPUT + '.cc', 'w')
+output_h = open(OUTPUT + '.h', 'w')
+
+def find_unicode(n):
+    """Return the first field of the line in `data` whose second field is n.
+
+    Fields are separated by ';'.  If no line matches, a message is printed
+    and the script exits with status 1.
+    """
+    for record in data.splitlines():
+        fields = record.split(';')
+        if fields[1] != n:
+            continue
+        return fields[0]
+
+    print 'Could not find %s' % n
+    sys.exit(1)
+
+def setup(output_diacritical_name):
+    """Write to the .cc file the definition of the map called output_diacritical_name."""
+    output_c.write('map<char, wchar_t> sub::iso6937::%s;\n' % output_diacritical_name)
+
+def fill(unicode_diacritical_name, output_diacritical_name, letters):
+    """Write the declaration and the entries of one diacritical's map.
+
+    unicode_diacritical_name -- name of the diacritical as used in Unicode
+                                character names (the part after 'WITH').
+    output_diacritical_name  -- name of the C++ map to declare and fill.
+    letters                  -- letters for which an entry is written.
+
+    The extern declaration goes to the .h file; one assignment per letter,
+    followed by a blank line, goes to the .cc file.
+    """
+    output_h.write('extern std::map<char, wchar_t> %s;\n' % output_diacritical_name)
+
+    for letter in letters:
+        case = 'CAPITAL' if letter.isupper() else 'SMALL'
+        unicode_name = 'LATIN %s LETTER %s WITH %s' % (case, letter.upper(), unicode_diacritical_name)
+        code_point = find_unicode(unicode_name)
+        output_c.write("\t%s['%s'] = 0x%s;\n" % (output_diacritical_name, letter, code_point))
+
+    output_c.write("\n")
+
+# Start of the generated .cc file: licence, generated-file notice, includes
+print>>output_c,"""/*
+    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+/* THIS FILE WAS AUTO-GENERATED BY iso6937.py */
+
+#include <map>
+#include "iso6937_tables.h"
+
+using std::map;
+"""
+
+# Start of the generated .h file: licence, generated-file notice, and the
+# opening of the sub and sub::iso6937 namespaces (closed at the end of this script)
+print>>output_h,"""/*
+    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+/* THIS FILE WAS AUTO-GENERATED BY iso6937.py */
+
+#include <map>
+
+namespace sub {
+
+extern void make_iso6937_tables ();
+
+namespace iso6937 {
+"""
+
+# One tuple per diacritical:
+#   (diacritical name as it appears in Unicode character names,
+#    name of the generated C++ map,
+#    letters which get an entry in that map)
+groups = [
+    ('GRAVE', 'grave', 'AEIOUaeiou'),
+    ('ACUTE', 'acute', 'ACEILNORSUYZacegilnorsuyz'),
+    ('CIRCUMFLEX', 'circumflex', 'ACEGHIJOSUWYaceghijosuwy'),
+    ('TILDE', 'tilde', 'AINOUainou'),
+    ('MACRON', 'macron', 'AEIOUaeiou'),
+    ('BREVE', 'breve', 'AGUagu'),
+    ('DOT ABOVE', 'dot', 'CEGIZcegz'),
+    ('DIAERESIS', 'diaeresis', 'AEIOUYaeiouy'),
+    ('RING ABOVE', 'ring', 'AUau'),
+    ('CEDILLA', 'cedilla', 'CGKLNRSTcklnrst'),
+    ('DOUBLE ACUTE', 'double_acute', 'OUou'),
+    ('OGONEK', 'ogonek', 'AEIUaeui'),
+    ('CARON', 'caron', 'CDELNRSTZcdelnrstz')
+]
+
+# Define one map per diacritical in the .cc file
+for g in groups:
+    setup(g[1])
+
+# Define and declare the additional map called main
+print>>output_c,"map<char, wchar_t> sub::iso6937::main;"
+print>>output_h,"extern std::map<char, wchar_t> main;"
+
+# Open the function which fills all of the maps
+print>>output_c,"""
+void
+sub::make_iso6937_tables ()
+{
+\tusing namespace sub::iso6937;
+"""
+
+# Declare each diacritical's map in the .h file and write its entries to the .cc file
+for g in groups:
+    fill(g[0], g[1], g[2])
+
+# Character 10 maps to the same value
+print>>output_c,"\tmain[10] = 0x000A;"
+
+# Characters 32 to 126 map to the same value; ' and \ are written with a
+# preceding backslash so that the generated character literal is valid C++
+for i in range(32, 127):
+    if chr(i) == "'" or chr(i) == "\\":
+        print>>output_c,"\tmain['\\%s'] = 0x00%x;" % (chr(i), i)
+    else:
+        print>>output_c,"\tmain['%s'] = 0x00%x;" % (chr(i), i)
+
+# Entries of the main map for bytes 161 to 255, written as
+# main[ISO 6937 byte] = Unicode code point.
+# From Wikipedia
+# http://en.wikipedia.org/wiki/ISO/IEC_6937
+print>>output_c,"\tmain[161] = 0x00A1;"
+print>>output_c,"\tmain[162] = 0x00A2;"
+print>>output_c,"\tmain[163] = 0x00A3;"
+print>>output_c,"\tmain[165] = 0x00A5;"
+print>>output_c,"\tmain[167] = 0x00A7;"
+print>>output_c,"\tmain[168] = 0x00A4;"
+print>>output_c,"\tmain[169] = 0x2018;"
+print>>output_c,"\tmain[170] = 0x201C;"
+print>>output_c,"\tmain[171] = 0x00AB;"
+print>>output_c,"\tmain[172] = 0x2190;"
+print>>output_c,"\tmain[173] = 0x2191;"
+print>>output_c,"\tmain[174] = 0x2192;"
+print>>output_c,"\tmain[175] = 0x2193;"
+print>>output_c,"\tmain[176] = 0x00B0;"
+print>>output_c,"\tmain[177] = 0x00B1;"
+print>>output_c,"\tmain[178] = 0x00B2;"
+print>>output_c,"\tmain[179] = 0x00B3;"
+print>>output_c,"\tmain[180] = 0x00D7;"
+print>>output_c,"\tmain[181] = 0x00B5;"
+print>>output_c,"\tmain[182] = 0x00B6;"
+print>>output_c,"\tmain[183] = 0x00B7;"
+print>>output_c,"\tmain[184] = 0x00F7;"
+print>>output_c,"\tmain[185] = 0x2019;"
+print>>output_c,"\tmain[186] = 0x201D;"
+print>>output_c,"\tmain[187] = 0x00BB;"
+print>>output_c,"\tmain[188] = 0x00BC;"
+print>>output_c,"\tmain[189] = 0x00BD;"
+print>>output_c,"\tmain[190] = 0x00BE;"
+print>>output_c,"\tmain[191] = 0x00BF;"
+# 193 to 207 are the diacritical bytes, mapped to Unicode combining marks
+print>>output_c,"\tmain[193] = 0x0300;"
+print>>output_c,"\tmain[194] = 0x0301;"
+print>>output_c,"\tmain[195] = 0x0302;"
+print>>output_c,"\tmain[196] = 0x0303;"
+print>>output_c,"\tmain[197] = 0x0304;"
+print>>output_c,"\tmain[198] = 0x0306;"
+print>>output_c,"\tmain[199] = 0x0307;"
+print>>output_c,"\tmain[200] = 0x0308;"
+print>>output_c,"\tmain[202] = 0x030A;"
+print>>output_c,"\tmain[203] = 0x0327;"
+print>>output_c,"\tmain[205] = 0x030B;"
+# Ogonek: U+0328 COMBINING OGONEK (U+032B is COMBINING INVERTED DOUBLE ARCH BELOW)
+print>>output_c,"\tmain[206] = 0x0328;"
+print>>output_c,"\tmain[207] = 0x030C;"
+print>>output_c,"\tmain[208] = 0x2015;"
+print>>output_c,"\tmain[209] = 0x00B9;"
+print>>output_c,"\tmain[210] = 0x00AE;"
+print>>output_c,"\tmain[211] = 0x00A9;"
+print>>output_c,"\tmain[212] = 0x2122;"
+print>>output_c,"\tmain[213] = 0x266A;"
+print>>output_c,"\tmain[214] = 0x00AC;"
+print>>output_c,"\tmain[215] = 0x00A6;"
+print>>output_c,"\tmain[220] = 0x215B;"
+print>>output_c,"\tmain[221] = 0x215C;"
+print>>output_c,"\tmain[222] = 0x215D;"
+print>>output_c,"\tmain[223] = 0x215E;"
+print>>output_c,"\tmain[224] = 0x2126;"
+print>>output_c,"\tmain[225] = 0x00C6;"
+print>>output_c,"\tmain[226] = 0x0110;"
+print>>output_c,"\tmain[227] = 0x00AA;"
+print>>output_c,"\tmain[228] = 0x0126;"
+print>>output_c,"\tmain[230] = 0x0132;"
+print>>output_c,"\tmain[231] = 0x013F;"
+print>>output_c,"\tmain[232] = 0x0141;"
+print>>output_c,"\tmain[233] = 0x00D8;"
+print>>output_c,"\tmain[234] = 0x0152;"
+print>>output_c,"\tmain[235] = 0x00BA;"
+print>>output_c,"\tmain[236] = 0x00DE;"
+print>>output_c,"\tmain[237] = 0x0166;"
+print>>output_c,"\tmain[238] = 0x014A;"
+print>>output_c,"\tmain[239] = 0x0149;"
+print>>output_c,"\tmain[240] = 0x0138;"
+print>>output_c,"\tmain[241] = 0x00E6;"
+print>>output_c,"\tmain[242] = 0x0111;"
+print>>output_c,"\tmain[243] = 0x00F0;"
+print>>output_c,"\tmain[244] = 0x0127;"
+print>>output_c,"\tmain[245] = 0x0131;"
+print>>output_c,"\tmain[246] = 0x0133;"
+print>>output_c,"\tmain[247] = 0x0140;"
+print>>output_c,"\tmain[248] = 0x0142;"
+print>>output_c,"\tmain[249] = 0x00F8;"
+print>>output_c,"\tmain[250] = 0x0153;"
+print>>output_c,"\tmain[251] = 0x00DF;"
+print>>output_c,"\tmain[252] = 0x00FE;"
+print>>output_c,"\tmain[253] = 0x0167;"
+print>>output_c,"\tmain[254] = 0x014B;"
+print>>output_c,"\tmain[255] = 0x00AD;"
+
+# Close make_iso6937_tables() in the .cc file, then write a blank line and
+# two closing braces for the namespaces to the .h file
+print>>output_c,"}"
+print>>output_h,""
+print>>output_h,"}"
+print>>output_h,"}"
+
+# Close both files explicitly so that everything written is flushed to disk
+output_c.close()
+output_h.close()
diff --git a/src/iso6937_tables.cc b/src/iso6937_tables.cc
new file mode 100644 (file)
index 0000000..07174c4
--- /dev/null
@@ -0,0 +1,396 @@
+/*
+    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+/* THIS FILE WAS AUTO-GENERATED BY iso6937.py */
+
+#include <map>
+#include "iso6937_tables.h"
+
+using std::map;
+
+/* Lookup tables, empty until make_iso6937_tables() has been called */
+map<char, wchar_t> sub::iso6937::grave;
+map<char, wchar_t> sub::iso6937::acute;
+map<char, wchar_t> sub::iso6937::circumflex;
+map<char, wchar_t> sub::iso6937::tilde;
+map<char, wchar_t> sub::iso6937::macron;
+map<char, wchar_t> sub::iso6937::breve;
+map<char, wchar_t> sub::iso6937::dot;
+map<char, wchar_t> sub::iso6937::diaeresis;
+map<char, wchar_t> sub::iso6937::ring;
+map<char, wchar_t> sub::iso6937::cedilla;
+map<char, wchar_t> sub::iso6937::double_acute;
+map<char, wchar_t> sub::iso6937::ogonek;
+map<char, wchar_t> sub::iso6937::caron;
+map<char, wchar_t> sub::iso6937::main;
+
+/** Fill the lookup tables in sub::iso6937.
+ *  Each diacritical's table maps a letter to the code point of that letter
+ *  with the diacritical; main maps single bytes to code points.
+ */
+void
+sub::make_iso6937_tables ()
+{
+       using namespace sub::iso6937;
+
+       grave['A'] = 0x00C0;
+       grave['E'] = 0x00C8;
+       grave['I'] = 0x00CC;
+       grave['O'] = 0x00D2;
+       grave['U'] = 0x00D9;
+       grave['a'] = 0x00E0;
+       grave['e'] = 0x00E8;
+       grave['i'] = 0x00EC;
+       grave['o'] = 0x00F2;
+       grave['u'] = 0x00F9;
+
+       acute['A'] = 0x00C1;
+       acute['C'] = 0x0106;
+       acute['E'] = 0x00C9;
+       acute['I'] = 0x00CD;
+       acute['L'] = 0x0139;
+       acute['N'] = 0x0143;
+       acute['O'] = 0x00D3;
+       acute['R'] = 0x0154;
+       acute['S'] = 0x015A;
+       acute['U'] = 0x00DA;
+       acute['Y'] = 0x00DD;
+       acute['Z'] = 0x0179;
+       acute['a'] = 0x00E1;
+       acute['c'] = 0x0107;
+       acute['e'] = 0x00E9;
+       acute['g'] = 0x01F5;
+       acute['i'] = 0x00ED;
+       acute['l'] = 0x013A;
+       acute['n'] = 0x0144;
+       acute['o'] = 0x00F3;
+       acute['r'] = 0x0155;
+       acute['s'] = 0x015B;
+       acute['u'] = 0x00FA;
+       acute['y'] = 0x00FD;
+       acute['z'] = 0x017A;
+
+       circumflex['A'] = 0x00C2;
+       circumflex['C'] = 0x0108;
+       circumflex['E'] = 0x00CA;
+       circumflex['G'] = 0x011C;
+       circumflex['H'] = 0x0124;
+       circumflex['I'] = 0x00CE;
+       circumflex['J'] = 0x0134;
+       circumflex['O'] = 0x00D4;
+       circumflex['S'] = 0x015C;
+       circumflex['U'] = 0x00DB;
+       circumflex['W'] = 0x0174;
+       circumflex['Y'] = 0x0176;
+       circumflex['a'] = 0x00E2;
+       circumflex['c'] = 0x0109;
+       circumflex['e'] = 0x00EA;
+       circumflex['g'] = 0x011D;
+       circumflex['h'] = 0x0125;
+       circumflex['i'] = 0x00EE;
+       circumflex['j'] = 0x0135;
+       circumflex['o'] = 0x00F4;
+       circumflex['s'] = 0x015D;
+       circumflex['u'] = 0x00FB;
+       circumflex['w'] = 0x0175;
+       circumflex['y'] = 0x0177;
+
+       tilde['A'] = 0x00C3;
+       tilde['I'] = 0x0128;
+       tilde['N'] = 0x00D1;
+       tilde['O'] = 0x00D5;
+       tilde['U'] = 0x0168;
+       tilde['a'] = 0x00E3;
+       tilde['i'] = 0x0129;
+       tilde['n'] = 0x00F1;
+       tilde['o'] = 0x00F5;
+       tilde['u'] = 0x0169;
+
+       macron['A'] = 0x0100;
+       macron['E'] = 0x0112;
+       macron['I'] = 0x012A;
+       macron['O'] = 0x014C;
+       macron['U'] = 0x016A;
+       macron['a'] = 0x0101;
+       macron['e'] = 0x0113;
+       macron['i'] = 0x012B;
+       macron['o'] = 0x014D;
+       macron['u'] = 0x016B;
+
+       breve['A'] = 0x0102;
+       breve['G'] = 0x011E;
+       breve['U'] = 0x016C;
+       breve['a'] = 0x0103;
+       breve['g'] = 0x011F;
+       breve['u'] = 0x016D;
+
+       dot['C'] = 0x010A;
+       dot['E'] = 0x0116;
+       dot['G'] = 0x0120;
+       dot['I'] = 0x0130;
+       dot['Z'] = 0x017B;
+       dot['c'] = 0x010B;
+       dot['e'] = 0x0117;
+       dot['g'] = 0x0121;
+       dot['z'] = 0x017C;
+
+       diaeresis['A'] = 0x00C4;
+       diaeresis['E'] = 0x00CB;
+       diaeresis['I'] = 0x00CF;
+       diaeresis['O'] = 0x00D6;
+       diaeresis['U'] = 0x00DC;
+       diaeresis['Y'] = 0x0178;
+       diaeresis['a'] = 0x00E4;
+       diaeresis['e'] = 0x00EB;
+       diaeresis['i'] = 0x00EF;
+       diaeresis['o'] = 0x00F6;
+       diaeresis['u'] = 0x00FC;
+       diaeresis['y'] = 0x00FF;
+
+       ring['A'] = 0x00C5;
+       ring['U'] = 0x016E;
+       ring['a'] = 0x00E5;
+       ring['u'] = 0x016F;
+
+       cedilla['C'] = 0x00C7;
+       cedilla['G'] = 0x0122;
+       cedilla['K'] = 0x0136;
+       cedilla['L'] = 0x013B;
+       cedilla['N'] = 0x0145;
+       cedilla['R'] = 0x0156;
+       cedilla['S'] = 0x015E;
+       cedilla['T'] = 0x0162;
+       cedilla['c'] = 0x00E7;
+       cedilla['k'] = 0x0137;
+       cedilla['l'] = 0x013C;
+       cedilla['n'] = 0x0146;
+       cedilla['r'] = 0x0157;
+       cedilla['s'] = 0x015F;
+       cedilla['t'] = 0x0163;
+
+       double_acute['O'] = 0x0150;
+       double_acute['U'] = 0x0170;
+       double_acute['o'] = 0x0151;
+       double_acute['u'] = 0x0171;
+
+       ogonek['A'] = 0x0104;
+       ogonek['E'] = 0x0118;
+       ogonek['I'] = 0x012E;
+       ogonek['U'] = 0x0172;
+       ogonek['a'] = 0x0105;
+       ogonek['e'] = 0x0119;
+       ogonek['u'] = 0x0173;
+       ogonek['i'] = 0x012F;
+
+       caron['C'] = 0x010C;
+       caron['D'] = 0x010E;
+       caron['E'] = 0x011A;
+       caron['L'] = 0x013D;
+       caron['N'] = 0x0147;
+       caron['R'] = 0x0158;
+       caron['S'] = 0x0160;
+       caron['T'] = 0x0164;
+       caron['Z'] = 0x017D;
+       caron['c'] = 0x010D;
+       caron['d'] = 0x010F;
+       caron['e'] = 0x011B;
+       caron['l'] = 0x013E;
+       caron['n'] = 0x0148;
+       caron['r'] = 0x0159;
+       caron['s'] = 0x0161;
+       caron['t'] = 0x0165;
+       caron['z'] = 0x017E;
+
+       main[10] = 0x000A;
+       main[' '] = 0x0020;
+       main['!'] = 0x0021;
+       main['"'] = 0x0022;
+       main['#'] = 0x0023;
+       main['$'] = 0x0024;
+       main['%'] = 0x0025;
+       main['&'] = 0x0026;
+       main['\''] = 0x0027;
+       main['('] = 0x0028;
+       main[')'] = 0x0029;
+       main['*'] = 0x002a;
+       main['+'] = 0x002b;
+       main[','] = 0x002c;
+       main['-'] = 0x002d;
+       main['.'] = 0x002e;
+       main['/'] = 0x002f;
+       main['0'] = 0x0030;
+       main['1'] = 0x0031;
+       main['2'] = 0x0032;
+       main['3'] = 0x0033;
+       main['4'] = 0x0034;
+       main['5'] = 0x0035;
+       main['6'] = 0x0036;
+       main['7'] = 0x0037;
+       main['8'] = 0x0038;
+       main['9'] = 0x0039;
+       main[':'] = 0x003a;
+       main[';'] = 0x003b;
+       main['<'] = 0x003c;
+       main['='] = 0x003d;
+       main['>'] = 0x003e;
+       main['?'] = 0x003f;
+       main['@'] = 0x0040;
+       main['A'] = 0x0041;
+       main['B'] = 0x0042;
+       main['C'] = 0x0043;
+       main['D'] = 0x0044;
+       main['E'] = 0x0045;
+       main['F'] = 0x0046;
+       main['G'] = 0x0047;
+       main['H'] = 0x0048;
+       main['I'] = 0x0049;
+       main['J'] = 0x004a;
+       main['K'] = 0x004b;
+       main['L'] = 0x004c;
+       main['M'] = 0x004d;
+       main['N'] = 0x004e;
+       main['O'] = 0x004f;
+       main['P'] = 0x0050;
+       main['Q'] = 0x0051;
+       main['R'] = 0x0052;
+       main['S'] = 0x0053;
+       main['T'] = 0x0054;
+       main['U'] = 0x0055;
+       main['V'] = 0x0056;
+       main['W'] = 0x0057;
+       main['X'] = 0x0058;
+       main['Y'] = 0x0059;
+       main['Z'] = 0x005a;
+       main['['] = 0x005b;
+       main['\\'] = 0x005c;
+       main[']'] = 0x005d;
+       main['^'] = 0x005e;
+       main['_'] = 0x005f;
+       main['`'] = 0x0060;
+       main['a'] = 0x0061;
+       main['b'] = 0x0062;
+       main['c'] = 0x0063;
+       main['d'] = 0x0064;
+       main['e'] = 0x0065;
+       main['f'] = 0x0066;
+       main['g'] = 0x0067;
+       main['h'] = 0x0068;
+       main['i'] = 0x0069;
+       main['j'] = 0x006a;
+       main['k'] = 0x006b;
+       main['l'] = 0x006c;
+       main['m'] = 0x006d;
+       main['n'] = 0x006e;
+       main['o'] = 0x006f;
+       main['p'] = 0x0070;
+       main['q'] = 0x0071;
+       main['r'] = 0x0072;
+       main['s'] = 0x0073;
+       main['t'] = 0x0074;
+       main['u'] = 0x0075;
+       main['v'] = 0x0076;
+       main['w'] = 0x0077;
+       main['x'] = 0x0078;
+       main['y'] = 0x0079;
+       main['z'] = 0x007a;
+       main['{'] = 0x007b;
+       main['|'] = 0x007c;
+       main['}'] = 0x007d;
+       main['~'] = 0x007e;
+       main[161] = 0x00A1;
+       main[162] = 0x00A2;
+       main[163] = 0x00A3;
+       main[165] = 0x00A5;
+       main[167] = 0x00A7;
+       main[168] = 0x00A4;
+       main[169] = 0x2018;
+       main[170] = 0x201C;
+       main[171] = 0x00AB;
+       main[172] = 0x2190;
+       main[173] = 0x2191;
+       main[174] = 0x2192;
+       main[175] = 0x2193;
+       main[176] = 0x00B0;
+       main[177] = 0x00B1;
+       main[178] = 0x00B2;
+       main[179] = 0x00B3;
+       main[180] = 0x00D7;
+       main[181] = 0x00B5;
+       main[182] = 0x00B6;
+       main[183] = 0x00B7;
+       main[184] = 0x00F7;
+       main[185] = 0x2019;
+       main[186] = 0x201D;
+       main[187] = 0x00BB;
+       main[188] = 0x00BC;
+       main[189] = 0x00BD;
+       main[190] = 0x00BE;
+       main[191] = 0x00BF;
+       main[193] = 0x0300;
+       main[194] = 0x0301;
+       main[195] = 0x0302;
+       main[196] = 0x0303;
+       main[197] = 0x0304;
+       main[198] = 0x0306;
+       main[199] = 0x0307;
+       main[200] = 0x0308;
+       main[202] = 0x030A;
+       main[203] = 0x0327;
+       main[205] = 0x030B;
+       /* U+0328 is COMBINING OGONEK */
+       main[206] = 0x0328;
+       main[207] = 0x030C;
+       main[208] = 0x2015;
+       main[209] = 0x00B9;
+       main[210] = 0x00AE;
+       main[211] = 0x00A9;
+       main[212] = 0x2122;
+       main[213] = 0x266A;
+       main[214] = 0x00AC;
+       main[215] = 0x00A6;
+       main[220] = 0x215B;
+       main[221] = 0x215C;
+       main[222] = 0x215D;
+       main[223] = 0x215E;
+       main[224] = 0x2126;
+       main[225] = 0x00C6;
+       main[226] = 0x0110;
+       main[227] = 0x00AA;
+       main[228] = 0x0126;
+       main[230] = 0x0132;
+       main[231] = 0x013F;
+       main[232] = 0x0141;
+       main[233] = 0x00D8;
+       main[234] = 0x0152;
+       main[235] = 0x00BA;
+       main[236] = 0x00DE;
+       main[237] = 0x0166;
+       main[238] = 0x014A;
+       main[239] = 0x0149;
+       main[240] = 0x0138;
+       main[241] = 0x00E6;
+       main[242] = 0x0111;
+       main[243] = 0x00F0;
+       main[244] = 0x0127;
+       main[245] = 0x0131;
+       main[246] = 0x0133;
+       main[247] = 0x0140;
+       main[248] = 0x0142;
+       main[249] = 0x00F8;
+       main[250] = 0x0153;
+       main[251] = 0x00DF;
+       main[252] = 0x00FE;
+       main[253] = 0x0167;
+       main[254] = 0x014B;
+       main[255] = 0x00AD;
+}
diff --git a/src/iso6937_tables.h b/src/iso6937_tables.h
new file mode 100644 (file)
index 0000000..58c8c4c
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+/* THIS FILE WAS AUTO-GENERATED BY iso6937.py */
+
+#include <map>
+
+namespace sub {
+
+/** Fill the tables declared in sub::iso6937 below */
+extern void make_iso6937_tables ();
+
+namespace iso6937 {
+
+/* Table for single bytes */
+extern std::map<char, wchar_t> main;
+/* One table per diacritical, keyed by the letter that the diacritical is applied to */
+extern std::map<char, wchar_t> grave;
+extern std::map<char, wchar_t> acute;
+extern std::map<char, wchar_t> circumflex;
+extern std::map<char, wchar_t> tilde;
+extern std::map<char, wchar_t> macron;
+extern std::map<char, wchar_t> breve;
+extern std::map<char, wchar_t> dot;
+extern std::map<char, wchar_t> diaeresis;
+extern std::map<char, wchar_t> ring;
+extern std::map<char, wchar_t> cedilla;
+extern std::map<char, wchar_t> double_acute;
+extern std::map<char, wchar_t> ogonek;
+extern std::map<char, wchar_t> caron;
+
+}
+}
index 3042a9328b828362f3b79016f54c5428b461a737..0d97e2e85bdbcf29c9a708ab7604aa2eb3b022c4 100644 (file)
 */
 
 #include <boost/lexical_cast.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/locale.hpp>
 #include "stl_binary_reader.h"
 #include "exceptions.h"
+#include "iso6937.h"
 #include "compose.hpp"
 
 using std::map;
@@ -27,13 +30,15 @@ using std::cout;
 using std::string;
 using std::istream;
 using boost::lexical_cast;
+using boost::algorithm::replace_all;
+using boost::locale::conv::utf_to_utf;
 using namespace sub;
 
-template <class T>
-T
-string_to_value (string k, map<string, STLCode<T> > m, string name)
+template <class E, class F>
+E
+file_to_enum (F k, map<F, STLCode<E> > m, string name)
 {
-       typename map<string, STLCode<T> >::const_iterator i = m.find (k);
+       typename map<F, STLCode<E> >::const_iterator i = m.find (k);
        if (i == m.end ()) {
                throw STLError (String::compose ("Unknown %1 %2 in binary STL file", name, k));
        }
@@ -41,11 +46,11 @@ string_to_value (string k, map<string, STLCode<T> > m, string name)
        return i->second.value;
 }
 
-template <class T>
+template <class E, class F>
 string
-value_to_description (T v, map<string, STLCode<T> > const & m)
+enum_to_description (E v, map<F, STLCode<E> > const & m)
 {
-       for (typename map<string, STLCode<T> >::const_iterator i = m.begin(); i != m.end(); ++i) {
+       for (typename map<F, STLCode<E> >::const_iterator i = m.begin(); i != m.end(); ++i) {
                if (i->second.value == v) {
                        return i->second.description;
                }
@@ -54,27 +59,27 @@ value_to_description (T v, map<string, STLCode<T> > const & m)
        return "";
 }
 
-template <class T>
+template <class E, class F>
 void
-code (map<string, STLCode<T> >& m, string k, T v, string d)
+code (map<F, STLCode<E> >& m, F k, E v, string d)
 {
-       m[k] = STLCode<T> (v, d);
+       m[k] = STLCode<E> (v, d);
 }
 
 
 STLBinaryReader::STLBinaryReader (istream& in)
        : _buffer (new unsigned char[1024])
 {
-       create_maps ();
+       create_tables ();
        
        in.read ((char *) _buffer, 1024);
        if (in.gcount() != 1024) {
                throw STLError ("Could not read GSI block from binary STL file");
        }
                
-       code_page_number = atoi (fixed_string (0, 3).c_str ());
+       code_page_number = atoi (get_string (0, 3).c_str ());
        
-       string const dfc = fixed_string (3, 8);
+       string const dfc = get_string (3, 8);
        if (dfc == "STL24.01") {
                frame_rate = 24;
        } else if (dfc == "STL25.01") {
@@ -85,34 +90,68 @@ STLBinaryReader::STLBinaryReader (istream& in)
                throw STLError (String::compose ("Unknown disk format code %1 in binary STL file", dfc));
        }
 
-       display_standard = string_to_value (fixed_string (11, 1), _display_standard_map, "display standard code");
-       language_group = string_to_value (fixed_string (12, 2), _language_group_map, "character code");
-       language = string_to_value (fixed_string (14, 2), _language_map, "language code");
-       original_programme_title = fixed_string (16, 32);
-       original_episode_title = fixed_string (48, 32);
-       translated_programme_title = fixed_string (80, 32);
-       translated_episode_title = fixed_string (112, 32);
-       translator_name = fixed_string (144, 32);
-       translator_contact_details = fixed_string (176, 32);
-       subtitle_list_reference_code = fixed_string (208, 32);
-       creation_date = fixed_string (224, 6);
-       revision_date = fixed_string (230, 6);
-       revision_number = fixed_string (236, 2);
-
-       tti_blocks = atoi (fixed_string (238, 6).c_str ());
-       number_of_subtitles = atoi (fixed_string (243, 5).c_str ());
-       subtitle_groups = atoi (fixed_string (248, 3).c_str ());
-       maximum_characters = atoi (fixed_string (251, 2).c_str ());
-       maximum_rows = atoi (fixed_string (253, 2).c_str ());
-       timecode_status = string_to_value (fixed_string (255, 1), _timecode_status_map, "timecode status code");
-       start_of_programme = fixed_string (256, 8);
-       first_in_cue = fixed_string (256, 8);
-       disks = atoi (fixed_string (272, 1).c_str ());
-       disk_sequence_number = atoi (fixed_string (273, 1).c_str ());
-       country_of_origin = fixed_string (274, 3);
-       publisher = fixed_string (277, 32);
-       editor_name = fixed_string (309, 32);
-       editor_contact_details = fixed_string (341, 32);
+       /* Fixed-position fields of the 1024-byte GSI block; each get_string call is (offset, length) */
+       display_standard = file_to_enum (get_string (11, 1), _display_standard_map, "display standard code");
+       language_group = file_to_enum (get_string (12, 2), _language_group_map, "character code");
+       language = file_to_enum (get_string (14, 2), _language_map, "language code");
+       original_programme_title = get_string (16, 32);
+       original_episode_title = get_string (48, 32);
+       translated_programme_title = get_string (80, 32);
+       translated_episode_title = get_string (112, 32);
+       translator_name = get_string (144, 32);
+       translator_contact_details = get_string (176, 32);
+       /* The subtitle list reference code is 16 bytes (208-223); the creation date starts at 224 */
+       subtitle_list_reference_code = get_string (208, 16);
+       creation_date = get_string (224, 6);
+       revision_date = get_string (230, 6);
+       revision_number = get_string (236, 2);
+
+       tti_blocks = atoi (get_string (238, 5).c_str ());
+       number_of_subtitles = atoi (get_string (243, 5).c_str ());
+       subtitle_groups = atoi (get_string (248, 3).c_str ());
+       maximum_characters = atoi (get_string (251, 2).c_str ());
+       maximum_rows = atoi (get_string (253, 2).c_str ());
+       timecode_status = file_to_enum (get_string (255, 1), _timecode_status_map, "timecode status code");
+       start_of_programme = get_string (256, 8);
+       /* The first in-cue time code follows the 8-byte start-of-programme time code */
+       first_in_cue = get_string (264, 8);
+       disks = atoi (get_string (272, 1).c_str ());
+       disk_sequence_number = atoi (get_string (273, 1).c_str ());
+       country_of_origin = get_string (274, 3);
+       publisher = get_string (277, 32);
+       editor_name = get_string (309, 32);
+       editor_contact_details = get_string (341, 32);
+
+       /* Read tti_blocks blocks of 128 bytes each, making a Subtitle from every one that is not a comment */
+       for (int i = 0; i < tti_blocks; ++i) {
+               Subtitle sub;
+               
+               in.read ((char *) _buffer, 128);
+               if (in.gcount() != 128) {
+                       throw STLError ("Could not read TTI block from binary STL file");
+               }
+
+               /* Byte 15 is the comment flag; blocks flagged as comments are skipped */
+               if (file_to_enum (get_int (15, 1), _comment_map, "comment flag") == COMMENT_YES) {
+                       continue;
+               }
+
+               /* Time codes at bytes 5 and 9 give the from and to times; byte 13 gives the line */
+               sub.from.frame = get_timecode (5);
+               sub.to.frame = get_timecode (9);
+               sub.line = get_int (13, 1);
+
+               /* XXX: justification, effects */
+
+               /* The text occupies the remaining 112 bytes of the block */
+               string s = get_string (16, 112);
+               
+               /* 8Ah is a new line */
+               replace_all (s, "\x8a", "\n");
+
+               /* 8Fh is unused space, so trim the string to the first instance of that */
+               size_t unused = s.find_first_of ('\x8f');
+               if (unused != string::npos) {
+                       s = s.substr (0, unused);
+               }
+               
+               /* Convert from ISO 6937 to UTF-16 and from there to a narrow-char (UTF-8) string */
+               sub.text = utf_to_utf<char> (iso6937_to_utf16 (s.c_str()));
+
+               _subs.push_back (sub);
+       }
 }
 
 STLBinaryReader::~STLBinaryReader ()
@@ -121,7 +160,7 @@ STLBinaryReader::~STLBinaryReader ()
 }
 
 string
-STLBinaryReader::fixed_string (int offset, int length) const
+STLBinaryReader::get_string (int offset, int length) const
 {
        string s;
        for (int i = 0; i < length; ++i) {
@@ -131,6 +170,23 @@ STLBinaryReader::fixed_string (int offset, int length) const
        return s;
 }
 
+/** Read an integer from the buffer, least significant byte first.
+ *  @param offset Offset of the first (least significant) byte in _buffer.
+ *  @param length Number of bytes to read; must be no more than sizeof (int).
+ *  @return Value read.
+ */
+int
+STLBinaryReader::get_int (int offset, int length) const
+{
+       /* Accumulate in an unsigned type so that a set top bit in the most
+          significant byte is never shifted into the sign bit of a signed int.
+       */
+       unsigned int v = 0;
+       for (int i = 0; i < length; ++i) {
+               v |= static_cast<unsigned int> (_buffer[offset + i]) << (8 * i);
+       }
+
+       return static_cast<int> (v);
+}
+
+/** Make a FrameTime from four consecutive bytes of the buffer.
+ *  @param offset Offset of the first of the four bytes in _buffer.
+ *  @return FrameTime constructed from the four bytes, in the order that they
+ *  appear (presumably hours, minutes, seconds, frames; confirm against FrameTime).
+ */
+FrameTime
+STLBinaryReader::get_timecode (int offset) const
+{
+       unsigned char const * const t = _buffer + offset;
+       return FrameTime (t[0], t[1], t[2], t[3]);
+}
+
 map<string, string>
 STLBinaryReader::metadata () const
 {
@@ -138,9 +194,9 @@ STLBinaryReader::metadata () const
 
        m["Code page number"] = lexical_cast<string> (code_page_number);
        m["Frame rate"] = lexical_cast<string> (frame_rate);
-       m["Display standard"] = value_to_description (display_standard, _display_standard_map);
-       m["Language group"] = value_to_description (language_group, _language_group_map);
-       m["Language"] = value_to_description (language, _language_map);
+       m["Display standard"] = enum_to_description (display_standard, _display_standard_map);
+       m["Language group"] = enum_to_description (language_group, _language_group_map);
+       m["Language"] = enum_to_description (language, _language_map);
        m["Original programme title"] = original_programme_title;
        m["Original episode title"] = original_episode_title;
        m["Translated programme title"] = translated_programme_title;
@@ -156,7 +212,7 @@ STLBinaryReader::metadata () const
        m["Subtitle groups"] = lexical_cast<string> (subtitle_groups);
        m["Maximum characters"] = lexical_cast<string> (maximum_characters);
        m["Maximum rows"] = lexical_cast<string> (maximum_rows);
-       m["Timecode status"] = value_to_description (timecode_status, _timecode_status_map);
+       m["Timecode status"] = enum_to_description (timecode_status, _timecode_status_map);
        m["Start of programme"] = start_of_programme;
        m["First in cue"] = first_in_cue;
        m["Disks"] = lexical_cast<string> (disks);
@@ -170,123 +226,136 @@ STLBinaryReader::metadata () const
 }
 
 void
-STLBinaryReader::create_maps ()
+STLBinaryReader::create_tables ()
 {
-       code (_display_standard_map, " ", DISPLAY_STANDARD_UNDEFINED, "Undefined");
-       code (_display_standard_map, "0", DISPLAY_STANDARD_OPEN_SUBTITLING, "Open subtitling");
-       code (_display_standard_map, "1", DISPLAY_STANDARD_LEVEL_1_TELETEXT, "Level 1 teletext");
-       code (_display_standard_map, "2", DISPLAY_STANDARD_LEVEL_2_TELETEXT, "Level 2 teletext");
+       code<DisplayStandard, string> (_display_standard_map, " ", DISPLAY_STANDARD_UNDEFINED, "Undefined");
+       code<DisplayStandard, string> (_display_standard_map, "0", DISPLAY_STANDARD_OPEN_SUBTITLING, "Open subtitling");
+       code<DisplayStandard, string> (_display_standard_map, "1", DISPLAY_STANDARD_LEVEL_1_TELETEXT, "Level 1 teletext");
+       code<DisplayStandard, string> (_display_standard_map, "2", DISPLAY_STANDARD_LEVEL_2_TELETEXT, "Level 2 teletext");
        
-       code (_language_group_map, "00", LANGUAGE_GROUP_LATIN, "Latin");
-       code (_language_group_map, "01", LANGUAGE_GROUP_LATIN_CYRILLIC, "Latin/Cyrillic");
-       code (_language_group_map, "02", LANGUAGE_GROUP_LATIN_ARABIC, "Latin/Arabic");
-       code (_language_group_map, "03", LANGUAGE_GROUP_LATIN_GREEK, "Latin/Greek");
-       code (_language_group_map, "04", LANGUAGE_GROUP_LATIN_HEBREW, "Latin/Hebrew");
+       code<LanguageGroup, string> (_language_group_map, "00", LANGUAGE_GROUP_LATIN, "Latin");
+       code<LanguageGroup, string> (_language_group_map, "01", LANGUAGE_GROUP_LATIN_CYRILLIC, "Latin/Cyrillic");
+       code<LanguageGroup, string> (_language_group_map, "02", LANGUAGE_GROUP_LATIN_ARABIC, "Latin/Arabic");
+       code<LanguageGroup, string> (_language_group_map, "03", LANGUAGE_GROUP_LATIN_GREEK, "Latin/Greek");
+       code<LanguageGroup, string> (_language_group_map, "04", LANGUAGE_GROUP_LATIN_HEBREW, "Latin/Hebrew");
        
-       code (_language_map, "00", LANGUAGE_UNKNOWN, "Unknown");
-       code (_language_map, "01", LANGUAGE_ALBANIAN, "Albanian");
-       code (_language_map, "02", LANGUAGE_BRETON, "Breton");
-       code (_language_map, "03", LANGUAGE_CATALAN, "Catalan");
-       code (_language_map, "04", LANGUAGE_CROATIAN, "Croatian");
-       code (_language_map, "05", LANGUAGE_WELSH, "Welsh");
-       code (_language_map, "06", LANGUAGE_CZECH, "Czech");
-       code (_language_map, "07", LANGUAGE_DANISH, "Danish");
-       code (_language_map, "08", LANGUAGE_GERMAN, "German");
-       code (_language_map, "09", LANGUAGE_ENGLISH, "English");
-       code (_language_map, "0A", LANGUAGE_SPANISH, "Spanish");
-       code (_language_map, "0B", LANGUAGE_ESPERANTO, "Esperanto");
-       code (_language_map, "0C", LANGUAGE_ESTONIAN, "Estonian");
-       code (_language_map, "0D", LANGUAGE_BASQUE, "Basque");
-       code (_language_map, "0E", LANGUAGE_FAROESE, "Faroese");
-       code (_language_map, "0F", LANGUAGE_FRENCH, "French");
-       code (_language_map, "10", LANGUAGE_FRISIAN, "Frisian");
-       code (_language_map, "11", LANGUAGE_IRISH, "Irish");
-       code (_language_map, "12", LANGUAGE_GAELIC, "Gaelic");
-       code (_language_map, "13", LANGUAGE_GALACIAN, "Galacian");
-       code (_language_map, "14", LANGUAGE_ICELANDIC, "Icelandic");
-       code (_language_map, "15", LANGUAGE_ITALIAN, "Italian");
-       code (_language_map, "16", LANGUAGE_LAPPISH, "Lappish");
-       code (_language_map, "17", LANGUAGE_LATIN, "Latin");
-       code (_language_map, "18", LANGUAGE_LATVIAN, "Latvian");
-       code (_language_map, "19", LANGUAGE_LUXEMBORGIAN, "Luxemborgian");
-       code (_language_map, "1A", LANGUAGE_LITHUANIAN, "Lithuanian");
-       code (_language_map, "1B", LANGUAGE_HUNGARIAN, "Hungarian");
-       code (_language_map, "1C", LANGUAGE_MALTESE, "Maltese");
-       code (_language_map, "1D", LANGUAGE_DUTCH, "Dutch");
-       code (_language_map, "1E", LANGUAGE_NORWEGIAN, "Norwegian");
-       code (_language_map, "1F", LANGUAGE_OCCITAN, "Occitan");
-       code (_language_map, "20", LANGUAGE_POLISH, "Polish");
-       code (_language_map, "21", LANGUAGE_PORTUGESE, "Portugese");
-       code (_language_map, "22", LANGUAGE_ROMANIAN, "Romanian");
-       code (_language_map, "23", LANGUAGE_ROMANSH, "Romansh");
-       code (_language_map, "24", LANGUAGE_SERBIAN, "Serbian");
-       code (_language_map, "25", LANGUAGE_SLOVAK, "Slovak");
-       code (_language_map, "26", LANGUAGE_SLOVENIAN, "Slovenian");
-       code (_language_map, "27", LANGUAGE_FINNISH, "Finnish");
-       code (_language_map, "28", LANGUAGE_SWEDISH, "Swedish");
-       code (_language_map, "29", LANGUAGE_TURKISH, "Turkish");
-       code (_language_map, "2A", LANGUAGE_FLEMISH, "Flemish");
-       code (_language_map, "2B", LANGUAGE_WALLON, "Wallon");
-       code (_language_map, "7F", LANGUAGE_AMHARIC, "Amharic");
-       code (_language_map, "7E", LANGUAGE_ARABIC, "Arabic");
-       code (_language_map, "7D", LANGUAGE_ARMENIAN, "Armenian");
-       code (_language_map, "7C", LANGUAGE_ASSAMESE, "Assamese");
-       code (_language_map, "7B", LANGUAGE_AZERBAIJANI, "Azerbaijani");
-       code (_language_map, "7A", LANGUAGE_BAMBORA, "Bambora");
-       code (_language_map, "79", LANGUAGE_BIELORUSSIAN, "Bielorussian");
-       code (_language_map, "78", LANGUAGE_BENGALI, "Bengali");
-       code (_language_map, "77", LANGUAGE_BULGARIAN, "Bulgarian");
-       code (_language_map, "76", LANGUAGE_BURMESE, "Burmese");
-       code (_language_map, "75", LANGUAGE_CHINESE, "Chinese");
-       code (_language_map, "74", LANGUAGE_CHURASH, "Churash");
-       code (_language_map, "73", LANGUAGE_DARI, "Dari");
-       code (_language_map, "72", LANGUAGE_FULANI, "Fulani");
-       code (_language_map, "71", LANGUAGE_GEORGIAN, "Georgian");
-       code (_language_map, "70", LANGUAGE_GREEK, "Greek");
-       code (_language_map, "6F", LANGUAGE_GUJURATI, "Gujarati");
-       code (_language_map, "6E", LANGUAGE_GURANI, "Gurani");
-       code (_language_map, "6D", LANGUAGE_HAUSA, "Hausa");
-       code (_language_map, "6C", LANGUAGE_HEBREW, "Hebrew");
-       code (_language_map, "6B", LANGUAGE_HINDI, "Hindi");
-       code (_language_map, "6A", LANGUAGE_INDONESIAN, "Indonesian");
-       code (_language_map, "69", LANGUAGE_JAPANESE, "Japanese");
-       code (_language_map, "68", LANGUAGE_KANNADA, "Kannada");
-       code (_language_map, "67", LANGUAGE_KAZAKH, "Kazakh");
-       code (_language_map, "66", LANGUAGE_KHMER, "Khmer");
-       code (_language_map, "65", LANGUAGE_KOREAN, "Korean");
-       code (_language_map, "64", LANGUAGE_LAOTIAN, "Laotian");
-       code (_language_map, "63", LANGUAGE_MACEDONIAN, "Macedonian");
-       code (_language_map, "62", LANGUAGE_MALAGASAY, "Malagasay");
-       code (_language_map, "61", LANGUAGE_MALAYSIAN, "Malaysian");
-       code (_language_map, "60", LANGUAGE_MOLDAVIAN, "Moldavian");
-       code (_language_map, "5F", LANGUAGE_MARATHI, "Marathi");
-       code (_language_map, "5E", LANGUAGE_NDEBELE, "Ndebele");
-       code (_language_map, "5D", LANGUAGE_NEPALI, "Nepali");
-       code (_language_map, "5C", LANGUAGE_ORIYA, "Oriya");
-       code (_language_map, "5B", LANGUAGE_PAPAMIENTO, "Papamiento");
-       code (_language_map, "5A", LANGUAGE_PERSIAN, "Persian");
-       code (_language_map, "59", LANGUAGE_PUNJABI, "Punjabi");
-       code (_language_map, "58", LANGUAGE_PUSHTU, "Pushtu");
-       code (_language_map, "57", LANGUAGE_QUECHUA, "Quechua");
-       code (_language_map, "56", LANGUAGE_RUSSIAN, "Russian");
-       code (_language_map, "55", LANGUAGE_RUTHENIAN, "Ruthenian");
-       code (_language_map, "54", LANGUAGE_SERBO_CROAT, "Serbo Croat");
-       code (_language_map, "53", LANGUAGE_SHONA, "Shona");
-       code (_language_map, "52", LANGUAGE_SINHALESE, "Sinhalese");
-       code (_language_map, "51", LANGUAGE_SOMALI, "Somali");
-       code (_language_map, "50", LANGUAGE_SRANAN_TONGO, "Sranan Tongo");
-       code (_language_map, "4F", LANGUAGE_SWAHILI, "Swahili");
-       code (_language_map, "4E", LANGUAGE_TADZHIK, "Tadzhik");
-       code (_language_map, "4D", LANGUAGE_TAMIL, "Tamil");
-       code (_language_map, "4C", LANGUAGE_TATAR, "Tatar");
-       code (_language_map, "4B", LANGUAGE_TELUGU, "Telugu");
-       code (_language_map, "4A", LANGUAGE_THAI, "Thai");
-       code (_language_map, "49", LANGUAGE_UKRANIAN, "Ukranian");
-       code (_language_map, "48", LANGUAGE_URDU, "Urdu");
-       code (_language_map, "47", LANGUAGE_UZBEK, "Uzbek");
-       code (_language_map, "46", LANGUAGE_VIETNAMESE, "Vietnamese");
-       code (_language_map, "45", LANGUAGE_ZULU, "Zulu");
-
-       code (_timecode_status_map, "0", TIMECODE_STATUS_NOT_INTENDED_FOR_USE, "Not intended for use");
-       code (_timecode_status_map, "1", TIMECODE_STATUS_INTENDED_FOR_USE, "Intended for use");
+       code<Language, string> (_language_map, "00", LANGUAGE_UNKNOWN, "Unknown");
+       code<Language, string> (_language_map, "01", LANGUAGE_ALBANIAN, "Albanian");
+       code<Language, string> (_language_map, "02", LANGUAGE_BRETON, "Breton");
+       code<Language, string> (_language_map, "03", LANGUAGE_CATALAN, "Catalan");
+       code<Language, string> (_language_map, "04", LANGUAGE_CROATIAN, "Croatian");
+       code<Language, string> (_language_map, "05", LANGUAGE_WELSH, "Welsh");
+       code<Language, string> (_language_map, "06", LANGUAGE_CZECH, "Czech");
+       code<Language, string> (_language_map, "07", LANGUAGE_DANISH, "Danish");
+       code<Language, string> (_language_map, "08", LANGUAGE_GERMAN, "German");
+       code<Language, string> (_language_map, "09", LANGUAGE_ENGLISH, "English");
+       code<Language, string> (_language_map, "0A", LANGUAGE_SPANISH, "Spanish");
+       code<Language, string> (_language_map, "0B", LANGUAGE_ESPERANTO, "Esperanto");
+       code<Language, string> (_language_map, "0C", LANGUAGE_ESTONIAN, "Estonian");
+       code<Language, string> (_language_map, "0D", LANGUAGE_BASQUE, "Basque");
+       code<Language, string> (_language_map, "0E", LANGUAGE_FAROESE, "Faroese");
+       code<Language, string> (_language_map, "0F", LANGUAGE_FRENCH, "French");
+       code<Language, string> (_language_map, "10", LANGUAGE_FRISIAN, "Frisian");
+       code<Language, string> (_language_map, "11", LANGUAGE_IRISH, "Irish");
+       code<Language, string> (_language_map, "12", LANGUAGE_GAELIC, "Gaelic");
+       code<Language, string> (_language_map, "13", LANGUAGE_GALACIAN, "Galacian");
+       code<Language, string> (_language_map, "14", LANGUAGE_ICELANDIC, "Icelandic");
+       code<Language, string> (_language_map, "15", LANGUAGE_ITALIAN, "Italian");
+       code<Language, string> (_language_map, "16", LANGUAGE_LAPPISH, "Lappish");
+       code<Language, string> (_language_map, "17", LANGUAGE_LATIN, "Latin");
+       code<Language, string> (_language_map, "18", LANGUAGE_LATVIAN, "Latvian");
+       code<Language, string> (_language_map, "19", LANGUAGE_LUXEMBORGIAN, "Luxemborgian");
+       code<Language, string> (_language_map, "1A", LANGUAGE_LITHUANIAN, "Lithuanian");
+       code<Language, string> (_language_map, "1B", LANGUAGE_HUNGARIAN, "Hungarian");
+       code<Language, string> (_language_map, "1C", LANGUAGE_MALTESE, "Maltese");
+       code<Language, string> (_language_map, "1D", LANGUAGE_DUTCH, "Dutch");
+       code<Language, string> (_language_map, "1E", LANGUAGE_NORWEGIAN, "Norwegian");
+       code<Language, string> (_language_map, "1F", LANGUAGE_OCCITAN, "Occitan");
+       code<Language, string> (_language_map, "20", LANGUAGE_POLISH, "Polish");
+       code<Language, string> (_language_map, "21", LANGUAGE_PORTUGESE, "Portugese");
+       code<Language, string> (_language_map, "22", LANGUAGE_ROMANIAN, "Romanian");
+       code<Language, string> (_language_map, "23", LANGUAGE_ROMANSH, "Romansh");
+       code<Language, string> (_language_map, "24", LANGUAGE_SERBIAN, "Serbian");
+       code<Language, string> (_language_map, "25", LANGUAGE_SLOVAK, "Slovak");
+       code<Language, string> (_language_map, "26", LANGUAGE_SLOVENIAN, "Slovenian");
+       code<Language, string> (_language_map, "27", LANGUAGE_FINNISH, "Finnish");
+       code<Language, string> (_language_map, "28", LANGUAGE_SWEDISH, "Swedish");
+       code<Language, string> (_language_map, "29", LANGUAGE_TURKISH, "Turkish");
+       code<Language, string> (_language_map, "2A", LANGUAGE_FLEMISH, "Flemish");
+       code<Language, string> (_language_map, "2B", LANGUAGE_WALLON, "Wallon");
+       code<Language, string> (_language_map, "7F", LANGUAGE_AMHARIC, "Amharic");
+       code<Language, string> (_language_map, "7E", LANGUAGE_ARABIC, "Arabic");
+       code<Language, string> (_language_map, "7D", LANGUAGE_ARMENIAN, "Armenian");
+       code<Language, string> (_language_map, "7C", LANGUAGE_ASSAMESE, "Assamese");
+       code<Language, string> (_language_map, "7B", LANGUAGE_AZERBAIJANI, "Azerbaijani");
+       code<Language, string> (_language_map, "7A", LANGUAGE_BAMBORA, "Bambora");
+       code<Language, string> (_language_map, "79", LANGUAGE_BIELORUSSIAN, "Bielorussian");
+       code<Language, string> (_language_map, "78", LANGUAGE_BENGALI, "Bengali");
+       code<Language, string> (_language_map, "77", LANGUAGE_BULGARIAN, "Bulgarian");
+       code<Language, string> (_language_map, "76", LANGUAGE_BURMESE, "Burmese");
+       code<Language, string> (_language_map, "75", LANGUAGE_CHINESE, "Chinese");
+       code<Language, string> (_language_map, "74", LANGUAGE_CHURASH, "Churash");
+       code<Language, string> (_language_map, "73", LANGUAGE_DARI, "Dari");
+       code<Language, string> (_language_map, "72", LANGUAGE_FULANI, "Fulani");
+       code<Language, string> (_language_map, "71", LANGUAGE_GEORGIAN, "Georgian");
+       code<Language, string> (_language_map, "70", LANGUAGE_GREEK, "Greek");
+       code<Language, string> (_language_map, "6F", LANGUAGE_GUJURATI, "Gujarati");
+       code<Language, string> (_language_map, "6E", LANGUAGE_GURANI, "Gurani");
+       code<Language, string> (_language_map, "6D", LANGUAGE_HAUSA, "Hausa");
+       code<Language, string> (_language_map, "6C", LANGUAGE_HEBREW, "Hebrew");
+       code<Language, string> (_language_map, "6B", LANGUAGE_HINDI, "Hindi");
+       code<Language, string> (_language_map, "6A", LANGUAGE_INDONESIAN, "Indonesian");
+       code<Language, string> (_language_map, "69", LANGUAGE_JAPANESE, "Japanese");
+       code<Language, string> (_language_map, "68", LANGUAGE_KANNADA, "Kannada");
+       code<Language, string> (_language_map, "67", LANGUAGE_KAZAKH, "Kazakh");
+       code<Language, string> (_language_map, "66", LANGUAGE_KHMER, "Khmer");
+       code<Language, string> (_language_map, "65", LANGUAGE_KOREAN, "Korean");
+       code<Language, string> (_language_map, "64", LANGUAGE_LAOTIAN, "Laotian");
+       code<Language, string> (_language_map, "63", LANGUAGE_MACEDONIAN, "Macedonian");
+       code<Language, string> (_language_map, "62", LANGUAGE_MALAGASAY, "Malagasay");
+       code<Language, string> (_language_map, "61", LANGUAGE_MALAYSIAN, "Malaysian");
+       code<Language, string> (_language_map, "60", LANGUAGE_MOLDAVIAN, "Moldavian");
+       code<Language, string> (_language_map, "5F", LANGUAGE_MARATHI, "Marathi");
+       code<Language, string> (_language_map, "5E", LANGUAGE_NDEBELE, "Ndebele");
+       code<Language, string> (_language_map, "5D", LANGUAGE_NEPALI, "Nepali");
+       code<Language, string> (_language_map, "5C", LANGUAGE_ORIYA, "Oriya");
+       code<Language, string> (_language_map, "5B", LANGUAGE_PAPAMIENTO, "Papamiento");
+       code<Language, string> (_language_map, "5A", LANGUAGE_PERSIAN, "Persian");
+       code<Language, string> (_language_map, "59", LANGUAGE_PUNJABI, "Punjabi");
+       code<Language, string> (_language_map, "58", LANGUAGE_PUSHTU, "Pushtu");
+       code<Language, string> (_language_map, "57", LANGUAGE_QUECHUA, "Quechua");
+       code<Language, string> (_language_map, "56", LANGUAGE_RUSSIAN, "Russian");
+       code<Language, string> (_language_map, "55", LANGUAGE_RUTHENIAN, "Ruthenian");
+       code<Language, string> (_language_map, "54", LANGUAGE_SERBO_CROAT, "Serbo Croat");
+       code<Language, string> (_language_map, "53", LANGUAGE_SHONA, "Shona");
+       code<Language, string> (_language_map, "52", LANGUAGE_SINHALESE, "Sinhalese");
+       code<Language, string> (_language_map, "51", LANGUAGE_SOMALI, "Somali");
+       code<Language, string> (_language_map, "50", LANGUAGE_SRANAN_TONGO, "Sranan Tongo");
+       code<Language, string> (_language_map, "4F", LANGUAGE_SWAHILI, "Swahili");
+       code<Language, string> (_language_map, "4E", LANGUAGE_TADZHIK, "Tadzhik");
+       code<Language, string> (_language_map, "4D", LANGUAGE_TAMIL, "Tamil");
+       code<Language, string> (_language_map, "4C", LANGUAGE_TATAR, "Tatar");
+       code<Language, string> (_language_map, "4B", LANGUAGE_TELUGU, "Telugu");
+       code<Language, string> (_language_map, "4A", LANGUAGE_THAI, "Thai");
+       code<Language, string> (_language_map, "49", LANGUAGE_UKRANIAN, "Ukranian");
+       code<Language, string> (_language_map, "48", LANGUAGE_URDU, "Urdu");
+       code<Language, string> (_language_map, "47", LANGUAGE_UZBEK, "Uzbek");
+       code<Language, string> (_language_map, "46", LANGUAGE_VIETNAMESE, "Vietnamese");
+       code<Language, string> (_language_map, "45", LANGUAGE_ZULU, "Zulu");
+
+       code<TimecodeStatus, string> (_timecode_status_map, "0", TIMECODE_STATUS_NOT_INTENDED_FOR_USE, "Not intended for use");
+       code<TimecodeStatus, string> (_timecode_status_map, "1", TIMECODE_STATUS_INTENDED_FOR_USE, "Intended for use");
+
+       code<CumulativeStatus, int> (_cumulative_status_map, 0, CUMULATIVE_STATUS_NOT_CUMULATIVE, "Not part of a cumulative set");
+       code<CumulativeStatus, int> (_cumulative_status_map, 1, CUMULATIVE_STATUS_FIRST, "First subtitle of a cumulative set");
+       code<CumulativeStatus, int> (_cumulative_status_map, 2, CUMULATIVE_STATUS_INTERMEDIATE, "Intermediate subtitle of a cumulative set");
+       code<CumulativeStatus, int> (_cumulative_status_map, 3, CUMULATIVE_STATUS_LAST, "Last subtitle of a cumulative set");
+
+       /* Justification codes; each code maps to its own enum value */
+       code<Justification, int> (_justification_map, 0, JUSTIFICATION_NONE, "No justification");
+       code<Justification, int> (_justification_map, 1, JUSTIFICATION_LEFT, "Left justification");
+       code<Justification, int> (_justification_map, 2, JUSTIFICATION_CENTRE, "Centre justification");
+       code<Justification, int> (_justification_map, 3, JUSTIFICATION_RIGHT, "Right justification");
+
+       code<Comment, int> (_comment_map, 0, COMMENT_NO, "Not a comment");
+       code<Comment, int> (_comment_map, 1, COMMENT_YES, "Is a comment");
 }
index 0491cb8d393caf11a8e102db26b6f77ecbb42f71..b049b93668a5afa43ba09e3f9624bbbcbd776808 100644 (file)
@@ -173,6 +173,25 @@ public:
                TIMECODE_STATUS_INTENDED_FOR_USE
        };
 
+       /** Position of a subtitle within a cumulative set, if it is part of one */
+       enum CumulativeStatus {
+               CUMULATIVE_STATUS_NOT_CUMULATIVE,
+               CUMULATIVE_STATUS_FIRST,
+               CUMULATIVE_STATUS_INTERMEDIATE,
+               CUMULATIVE_STATUS_LAST
+       };
+
+       /** Justification of a subtitle */
+       enum Justification {
+               JUSTIFICATION_NONE,
+               JUSTIFICATION_LEFT,
+               JUSTIFICATION_CENTRE,
+               JUSTIFICATION_RIGHT
+       };
+
+       /** Whether or not a TTI block is a comment */
+       enum Comment {
+               COMMENT_NO,
+               COMMENT_YES
+       };
+
        int code_page_number;
        int frame_rate;
        DisplayStandard display_standard;
@@ -204,13 +223,19 @@ public:
        std::string editor_contact_details;
 
 private:
-       std::string fixed_string (int, int) const;
-       void create_maps ();
+       std::string get_string (int, int) const;
+       int get_int (int, int) const;
+       FrameTime get_timecode (int) const;
+       
+       void create_tables ();
        
        std::map<std::string, STLCode<DisplayStandard> > _display_standard_map;
        std::map<std::string, STLCode<LanguageGroup> > _language_group_map;
        std::map<std::string, STLCode<Language> > _language_map;
        std::map<std::string, STLCode<TimecodeStatus> > _timecode_status_map;
+       std::map<int, STLCode<CumulativeStatus> > _cumulative_status_map;
+       std::map<int, STLCode<Justification> > _justification_map;
+       std::map<int, STLCode<Comment> > _comment_map;
 
        unsigned char* _buffer;
 };
index 70a8eb6d0ed73f98ef8547353b3bfb867cbb2219..fc08ac7ce9ad9b001b234a0c68d9750a63e50e98 100644 (file)
@@ -42,6 +42,7 @@ public:
                , line (0)
        {}
 
+       /** Subtitle text in UTF-8 */
        std::string text;
        std::string font;
 
@@ -71,7 +72,7 @@ public:
        bool bold;      ///< true to use a bold version of font
        bool italic;    ///< true to use an italic version of font
        bool underline; ///< true to underline
-       int line;
+       int line;       ///< line number, starting from 0
 
        /** from time */
        struct {
index d834c4329a42e0e11f8ed81b04f5aa43cc96f6c9..9cb874a10f5d8af827da61a02616fc1d56458bfa 100644 (file)
@@ -8,7 +8,7 @@ def build(bld):
 
     obj.name = 'libsub'
     obj.target = 'sub'
-    obj.uselib = 'CXML BOOST_FILESYSTEM'
+    obj.uselib = 'CXML BOOST_FILESYSTEM BOOST_LOCALE'
     obj.export_includes = ['.']
     obj.source = """
                  colour.cc
@@ -16,6 +16,8 @@ def build(bld):
                  dcp_reader.cc
                  effect.cc
                  frame_time.cc
+                 iso6937.cc
+                 iso6937_tables.cc
                  metric_time.cc
                  reader.cc
                  reader_factory.cc
diff --git a/test/iso6937_test.cc b/test/iso6937_test.cc
new file mode 100644 (file)
index 0000000..e8563b8
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <boost/test/unit_test.hpp>
+#include <boost/locale.hpp>
+#include "iso6937.h"
+
+using std::cout;
+using boost::locale::conv::utf_to_utf;
+
+/** Check conversion of ISO 6937 strings by converting them to UTF-16 and then
+ *  to UTF-8 for comparison with the expected text.
+ */
+BOOST_AUTO_TEST_CASE (iso6937_test)
+{
+       /* Plain ASCII should pass through unchanged */
+       BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Hello world")), "Hello world");
+       /* 0xA9 and 0xB9 should become left and right single quotation marks */
+       BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Testing \xA9testing\xB9")), "Testing ‘testing’");
+       /* 0xCB followed by a letter should give that letter with a cedilla */
+       BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("All must have \xCB""cedillas")), "All must have çedillas");
+       /* 0xC8 followed by a letter should give that letter with a diaeresis */
+       BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("M\xC8otorhead")), "Mötorhead");
+       BOOST_CHECK_EQUAL (utf_to_utf<char> (sub::iso6937_to_utf16 ("Pass\nnewlines\nthrough")), "Pass\nnewlines\nthrough");
+}
index f6714afe1b01c5fa05689b81c7716edf45ded631..6bb8b5bb4db3d812e2f5d64c23fd21a61fd152ce 100644 (file)
@@ -23,6 +23,7 @@
 #include <boost/filesystem.hpp>
 #include <fstream>
 #include <string>
+#include "iso6937_tables.h"
 
 using std::string;
 using std::cerr;
@@ -40,6 +41,8 @@ struct TestConfig
                } else {
                        BOOST_TEST_MESSAGE ("Private data libsub-test-private not found; some tests will not run");
                }
+
+               sub::make_iso6937_tables ();
        }
 };
 
index 630a6a004ad4c58a934dd950c43ae09c39f60bd3..c3ee7f4ce7c7d0edee6af73e6ded6cbdee0fc268 100644 (file)
@@ -18,6 +18,7 @@ def build(bld):
     obj.source = """
                  dcp_reader_test.cc
                  dcp_to_stl_text_test.cc
+                 iso6937_test.cc
                  stl_binary_reader_test.cc
                  stl_text_reader_test.cc
                  stl_text_writer_test.cc
index 6213da07626bcfcf39f65f6b91a1d5cdbb74c14d..b37203dc9615c491d09c4824aa0bd007ad957fbd 100644 (file)
@@ -27,6 +27,7 @@ using std::string;
 using std::cerr;
 using std::cout;
 using std::map;
+using std::list;
 using boost::shared_ptr;
 using namespace sub;
 
@@ -80,5 +81,10 @@ main (int argc, char* argv[])
                cout << i->first << ": " << i->second << "\n";
        }
 
+       list<sub::Subtitle> subs = reader->subtitles ();
+       for (list<sub::Subtitle>::const_iterator i = subs.begin(); i != subs.end(); ++i) {
+               cout << i->text << "\n";
+       }
+
        return 0;
 }
diff --git a/wscript b/wscript
index a664b8bba0487da1e916ac148f659ca5491bb75a..5f619b544549d958ddba56489c7b6ff147090229 100644 (file)
--- a/wscript
+++ b/wscript
@@ -49,6 +49,15 @@ def configure(conf):
                    lib = ['boost_filesystem', 'boost_system'],
                    uselib_store = 'BOOST_FILESYSTEM')
 
+    conf.check_cxx(fragment = """
+                             #include <boost/locale.hpp>\n
+                             int main() { boost::locale::conv::to_utf<char> ("a", "cp850"); }\n
+                             """,
+                   msg = 'Checking for boost locale library',
+                   libpath = '/usr/local/lib',
+                   lib = ['boost_locale', 'boost_system'],
+                   uselib_store = 'BOOST_LOCALE')
+
     conf.recurse('test')
 
 def build(bld):