Strip Unicode U+202B (right-to-left-embedding) code; it looks like DoM does RTL ...
[libsub.git] / src / iso6937.cc
index 048fd8419a20b6ff61d0bee8339bf93e4b2525bb..8409f9b00eabdfcfba37ee6246ccb542ef3801e3 100644 (file)
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2014 Carl Hetherington <cth@carlh.net>
+    Copyright (C) 2014-2015 Carl Hetherington <cth@carlh.net>
 
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
 
 */
 
-#include <string>
-#include <boost/optional.hpp>
 #include "iso6937_tables.h"
 #include "iso6937.h"
+#include <boost/optional.hpp>
+#include <boost/locale.hpp>
+#include <string>
+#include <iostream>
 
 using std::string;
 using std::cout;
+using std::wcout;
 using std::wstring;
+using std::map;
+using boost::optional;
+using boost::locale::conv::utf_to_utf;
 using namespace sub;
 
 wstring
 sub::iso6937_to_utf16 (string s)
 {
-       if (iso6937::grave.empty ()) {
+       if (iso6937::diacriticals.empty ()) {
                make_iso6937_tables ();
        }
-       
+
        wstring o;
 
        boost::optional<unsigned char> diacritical;
@@ -44,48 +50,7 @@ sub::iso6937_to_utf16 (string s)
                if (u >= 0xc1 && u <= 0xcf) {
                        diacritical = u;
                } else if (diacritical) {
-                       switch (diacritical.get ()) {
-                       case 0xC1:
-                               o += iso6937::grave[u];
-                               break;
-                       case 0xC2:
-                               o += iso6937::acute[u];
-                               break;
-                       case 0xC3:
-                               o += iso6937::circumflex[u];
-                               break;
-                       case 0xC4:
-                               o += iso6937::tilde[u];
-                               break;
-                       case 0xC5:
-                               o += iso6937::macron[u];
-                               break;
-                       case 0xC6:
-                               o += iso6937::breve[u];
-                               break;
-                       case 0xC7:
-                               o += iso6937::dot[u];
-                               break;
-                       case 0xC8:
-                               o += iso6937::diaeresis[u];
-                               break;
-                       case 0xCA:
-                               o += iso6937::ring[u];
-                               break;
-                       case 0xCB:
-                               o += iso6937::cedilla[u];
-                               break;
-                       case 0xCD:
-                               o += iso6937::double_acute[u];
-                               break;
-                       case 0xCE:
-                               o += iso6937::ogonek[u];
-                               break;
-                       case 0xCF:
-                               o += iso6937::caron[u];
-                               break;
-                       }
-
+                       o += (*iso6937::diacriticals[diacritical.get()])[u];
                        diacritical.reset ();
                } else {
                        o += iso6937::main[u];
@@ -96,3 +61,62 @@ sub::iso6937_to_utf16 (string s)
 
        return o;
 }
+
+static optional<char>
+find (map<char, wchar_t> const & m, wchar_t c)
+{
+       for (map<char, wchar_t>::const_iterator i = m.begin(); i != m.end(); ++i) {
+               if (i->second == c) {
+                       return i->first;
+               }
+       }
+
+       return optional<char> ();
+}
+
+string
+sub::utf16_to_iso6937 (wstring s)
+{
+       if (iso6937::diacriticals.empty ()) {
+               make_iso6937_tables ();
+       }
+
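+       /* XXX: slow; each character is looked up by linear search of the main table and then of every diacritical table */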
+
+       string o;
+       for (size_t i = 0; i < s.size(); ++i) {
+               optional<char> c = find (iso6937::main, s[i]);
+               if (c) {
+                       o += c.get ();
+               } else {
+                       for (map<char, map<char, wchar_t> *>::const_iterator j = iso6937::diacriticals.begin(); j != iso6937::diacriticals.end(); ++j) {
+                               c = find (*(j->second), s[i]);
+                               if (c) {
+                                       o += j->first;
+                                       o += c.get ();
+                                       break;
+                               }
+                       }
+               }
+
+               if (s[i] == 0x201e) {
+                       /* ISO6937 does not support the German (lower) quotation mark (Unicode 0x201e), so use
+                          a normal opening one (0x201c, which is 170 in ISO6937).
+                       */
+                       o += (char) 170;
+               } else if (s[i] == 0x2013 || s[i] == 0x2014) {
+                       /* ISO6937 does not support en- or em-dashes, so use a horizontal bar (0x2015,
+                          which is 208 in ISO6937).
+                       */
+                       o += (char) 208;
+               } else if (s[i] == 0x2010 || s[i] == 0x2011 || s[i] == 0x2012) {
+                       /* Similar story with hyphen, non-breaking hyphen, figure dash */
+                       o += '-';
+               } else if (s[i] == 0x2032) {
+                       /* And prime */
+                       o += '\'';
+               }
+       }
+
+       return o;
+}