Nearly pretty-print subtitle XML (though not in <Text> nodes).
authorCarl Hetherington <cth@carlh.net>
Tue, 15 Mar 2022 14:32:49 +0000 (15:32 +0100)
committerCarl Hetherington <cth@carlh.net>
Sat, 19 Mar 2022 22:04:35 +0000 (23:04 +0100)
This is an attempt to fix DoM bug #2205.

src/interop_subtitle_asset.cc
src/smpte_subtitle_asset.cc
src/subtitle_asset.cc
src/subtitle_asset.h
test/shared_subtitle_test.cc

index d51472845c66e92c93d63194e806941d83fecff9..bb0cad701fee2b67427aa67265e5c6c3a7b4946e 100644 (file)
@@ -125,7 +125,7 @@ InteropSubtitleAsset::xml_as_string () const
 
        subtitles_as_xml (root, 250, Standard::INTEROP);
 
-       return doc.write_to_string ("UTF-8");
+       return format_xml(doc, {});
 }
 
 
index f09d99f37ff21e474fc913704c55b17a1ef87809..0693323cf5861f3f2e2f4c0f5251379e66ba32e1 100644 (file)
@@ -344,8 +344,6 @@ SMPTESubtitleAsset::xml_as_string () const
 {
        xmlpp::Document doc;
        auto root = doc.create_root_node ("SubtitleReel");
-       root->set_namespace_declaration (subtitle_smpte_ns);
-       root->set_namespace_declaration ("http://www.w3.org/2001/XMLSchema", "xs");
 
        DCP_ASSERT (_xml_id);
        root->add_child("Id")->add_child_text("urn:uuid:" + *_xml_id);
@@ -374,7 +372,7 @@ SMPTESubtitleAsset::xml_as_string () const
 
        subtitles_as_xml (root->add_child("SubtitleList"), _time_code_rate, Standard::SMPTE);
 
-       return doc.write_to_string ("UTF-8");
+       return format_xml(doc, { {"", subtitle_smpte_ns}, {"xs", "http://www.w3.org/2001/XMLSchema"} });
 }
 
 
index 22781196ac5c485ca042ba8813895e06a85db7f1..de6332e6810d17b6c143211d6eaee4af91abcb8b 100644 (file)
 #include <boost/algorithm/string.hpp>
 #include <boost/lexical_cast.hpp>
 #include <boost/shared_array.hpp>
+#include <algorithm>
 
 
-using std::dynamic_pointer_cast;
-using std::string;
-using std::cout;
 using std::cerr;
+using std::cout;
+using std::dynamic_pointer_cast;
+using std::make_shared;
 using std::map;
+using std::pair;
 using std::shared_ptr;
+using std::string;
 using std::vector;
-using std::make_shared;
-using boost::optional;
 using boost::lexical_cast;
+using boost::optional;
 using namespace dcp;
 
 
@@ -799,3 +801,107 @@ SubtitleAsset::fix_empty_font_ids ()
                }
        }
 }
+
+
+struct State
+{
+       int indent;
+       string xml;
+       int disable_formatting;
+};
+
+
+static
+void
+format_xml_node (xmlpp::Node const* node, State& state)
+{
+       if (auto text_node = dynamic_cast<const xmlpp::TextNode*>(node)) {
+               state.xml += text_node->get_content();
+       } else if (auto element = dynamic_cast<const xmlpp::Element*>(node)) {
+               ++state.indent;
+
+               auto children = element->get_children();
+               auto const should_disable_formatting =
+                       std::any_of(
+                               children.begin(), children.end(),
+                               [](xmlpp::Node const* node) { return static_cast<bool>(dynamic_cast<const xmlpp::ContentNode*>(node)); }
+                               ) || element->get_name() == "Text";
+
+               if (!state.disable_formatting) {
+                       state.xml += "\n" + string(state.indent * 2, ' ');
+               }
+
+               state.xml += "<" + element->get_name();
+
+               for (auto attribute: element->get_attributes()) {
+                       state.xml += String::compose(" %1=\"%2\"", attribute->get_name().raw(), attribute->get_value().raw());
+               }
+
+               if (children.empty()) {
+                       state.xml += "/>";
+               } else {
+                       state.xml += ">";
+
+                       if (should_disable_formatting) {
+                               ++state.disable_formatting;
+                       }
+
+                       for (auto child: children) {
+                               format_xml_node(child, state);
+                       }
+
+                       if (!state.disable_formatting) {
+                               state.xml += "\n" + string(state.indent * 2, ' ');
+                       }
+
+                       state.xml += String::compose("</%1>", element->get_name().raw());
+
+                       if (should_disable_formatting) {
+                               --state.disable_formatting;
+                       }
+               }
+
+               --state.indent;
+       }
+}
+
+
+/** Format XML much as write_to_string_formatted() would do, except without adding any white space
+ *  to <Text> nodes.  This is an attempt to avoid changing what is actually displayed as subtitles
+ *  while also formatting the XML in such a way as to avoid DoM bug 2205.
+ *
+ *  namespace is a list of namespaces for the root node; it would be nicer to set these up with
+ *  set_namespace_declaration in the caller and then to extract them here but I couldn't find a way
+ *  to get all namespaces with the libxml++ API.
+ */
+string
+SubtitleAsset::format_xml (xmlpp::Document const& document, vector<pair<string, string>> const& namespaces)
+{
+       auto root = document.get_root_node();
+
+       State state = {};
+       state.xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<" + root->get_name();
+
+       for (auto const& ns: namespaces) {
+               if (ns.first.empty()) {
+                       state.xml += String::compose(" xmlns=\"%1\"", ns.second);
+               } else {
+                       state.xml += String::compose(" xmlns:%1=\"%2\"", ns.first, ns.second);
+               }
+       }
+
+       for (auto attribute: root->get_attributes()) {
+               state.xml += String::compose(" %1=\"%2\"", attribute->get_name().raw(), attribute->get_value().raw());
+       }
+
+       state.xml += ">";
+
+       for (auto child: document.get_root_node()->get_children()) {
+               format_xml_node(child, state);
+       }
+
+       state.xml += String::compose("\n</%1>\n", root->get_name().raw());
+
+       return state.xml;
+}
+
index f51906e22aabfe49c87611a3531dee868d942182..88b5378cab2986661b27af9f38bd3371a3f24d64 100644 (file)
 #include <libcxml/cxml.h>
 #include <boost/shared_array.hpp>
 #include <map>
+#include <string>
+#include <utility>
+#include <vector>
 
 
 namespace xmlpp {
+       class Document;
        class Element;
 }
 
@@ -128,6 +132,8 @@ public:
                return _raw_xml;
        }
 
+       static std::string format_xml (xmlpp::Document const& document, std::vector<std::pair<std::string, std::string>> const& namespaces);
+
 protected:
        friend struct ::interop_dcp_font_test;
        friend struct ::smpte_dcp_font_test;
index f18b03db680db453fe8a40425f41d44f44647634..2231d631cc885f406b370709b12d462120bbedea 100644 (file)
@@ -167,3 +167,41 @@ BOOST_AUTO_TEST_CASE (pull_fonts_test3)
        BOOST_CHECK_EQUAL (sub1->font._values["size"], "42");
 }
 
+
+/* Check that subtitle XML is prettily formatted without inserting any white space into
+ * <Text> node, which I think has the potential to alter appearance.
+ */
+BOOST_AUTO_TEST_CASE (format_xml_test1)
+{
+       xmlpp::Document doc;
+       auto root = doc.create_root_node("Foo");
+       root->add_child("Empty");
+       root->add_child("Text")->add_child_text("Hello world");
+       root->add_child("Font")->add_child("Text")->add_child_text("Say what");
+       auto fred = root->add_child("Text")->add_child("Font");
+       fred->set_attribute("bob", "job");
+       fred->add_child_text("Fred");
+       fred->add_child("Text")->add_child_text("Jim");
+       fred->add_child_text("Sheila");
+       BOOST_REQUIRE_EQUAL (dcp::SubtitleAsset::format_xml(doc, { {"", "fred"}, {"jim", "sheila"} }),
+"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+"<Foo xmlns=\"fred\" xmlns:jim=\"sheila\">\n"
+"  <Empty/>\n"
+"  <Text>Hello world</Text>\n"
+"  <Font>\n"
+"    <Text>Say what</Text>\n"
+"  </Font>\n"
+"  <Text><Font bob=\"job\">Fred<Text>Jim</Text>Sheila</Font></Text>\n"
+"</Foo>\n");
+}
+
+
+BOOST_AUTO_TEST_CASE (format_xml_test2)
+{
+       xmlpp::DomParser parser;
+       auto path = private_test / "DKH_UT_EN20160601def.xml";
+       parser.parse_file(path.string().c_str());
+       auto document = parser.get_document();
+       check_xml (dcp::file_to_string(private_test / "DKH_UT_EN20160601def.reformatted.xml"), dcp::SubtitleAsset::format_xml(*document, {}), {});
+}
+