Validate XML with xerces.
[libdcp.git] / src / verify.cc
1 /*
2     Copyright (C) 2018-2019 Carl Hetherington <cth@carlh.net>
3
4     This file is part of libdcp.
5
6     libdcp is free software; you can redistribute it and/or modify
7     it under the terms of the GNU General Public License as published by
8     the Free Software Foundation; either version 2 of the License, or
9     (at your option) any later version.
10
11     libdcp is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14     GNU General Public License for more details.
15
16     You should have received a copy of the GNU General Public License
17     along with libdcp.  If not, see <http://www.gnu.org/licenses/>.
18
19     In addition, as a special exception, the copyright holders give
20     permission to link the code of portions of this program with the
21     OpenSSL library under certain conditions as described in each
22     individual source file, and distribute linked combinations
23     including the two.
24
25     You must obey the GNU General Public License in all respects
26     for all of the code used other than OpenSSL.  If you modify
27     file(s) with this exception, you may extend this exception to your
28     version of the file(s), but you are not obligated to do so.  If you
29     do not wish to do so, delete this exception statement from your
30     version.  If you delete this exception statement from all source
31     files in the program, then also delete it here.
32 */
33
34 #include "verify.h"
35 #include "dcp.h"
36 #include "cpl.h"
37 #include "reel.h"
38 #include "reel_picture_asset.h"
39 #include "reel_sound_asset.h"
40 #include "exceptions.h"
41 #include "compose.hpp"
42 #include "raw_convert.h"
43 #include <xercesc/util/PlatformUtils.hpp>
44 #include <xercesc/parsers/XercesDOMParser.hpp>
45 #include <xercesc/parsers/AbstractDOMParser.hpp>
46 #include <xercesc/sax/HandlerBase.hpp>
47 #include <xercesc/dom/DOMImplementation.hpp>
48 #include <xercesc/dom/DOMImplementationLS.hpp>
49 #include <xercesc/dom/DOMImplementationRegistry.hpp>
50 #include <xercesc/dom/DOMLSParser.hpp>
51 #include <xercesc/dom/DOMException.hpp>
52 #include <xercesc/dom/DOMDocument.hpp>
53 #include <xercesc/dom/DOMNodeList.hpp>
54 #include <xercesc/dom/DOMError.hpp>
55 #include <xercesc/dom/DOMLocator.hpp>
56 #include <xercesc/dom/DOMNamedNodeMap.hpp>
57 #include <xercesc/dom/DOMAttr.hpp>
58 #include <xercesc/dom/DOMErrorHandler.hpp>
59 #include <xercesc/framework/LocalFileInputSource.hpp>
60 #include <boost/noncopyable.hpp>
61 #include <boost/foreach.hpp>
62 #include <boost/algorithm/string.hpp>
63 #include <boost/regex.hpp>
64 #include <map>
65 #include <list>
66 #include <vector>
67 #include <iostream>
68
69 using std::list;
70 using std::vector;
71 using std::string;
72 using std::cout;
73 using std::map;
74 using boost::shared_ptr;
75 using boost::optional;
76 using boost::function;
77
78 using namespace dcp;
79 using namespace xercesc;
80
/** Outcome of checking the hash of a single asset; see verify_asset() */
enum Result {
        /** the hash computed from the asset agrees with the PKL, and the CPL (if it gives a hash) agrees too */
        RESULT_GOOD = 0,
        /** the CPL gives a hash for the asset which is different to the one in the PKL */
        RESULT_CPL_PKL_DIFFER = 1,
        /** the hash computed from the asset is different to the one in the PKL */
        RESULT_BAD = 2
};
86
87 static
88 string
89 xml_ch_to_string (XMLCh const * a)
90 {
91         char* x = XMLString::transcode(a);
92         string const o(x);
93         XMLString::release(&x);
94         return o;
95 }
96
97 class XMLValidationError
98 {
99 public:
100         XMLValidationError (SAXParseException const & e)
101                 : _message (xml_ch_to_string(e.getMessage()))
102                 , _line (e.getLineNumber())
103                 , _column (e.getColumnNumber())
104         {
105
106         }
107
108         string message () const {
109                 return _message;
110         }
111
112         uint64_t line () const {
113                 return _line;
114         }
115
116         uint64_t column () const {
117                 return _column;
118         }
119
120 private:
121         string _message;
122         uint64_t _line;
123         uint64_t _column;
124 };
125
126
127 class DCPErrorHandler : public ErrorHandler
128 {
129 public:
130         void warning(const SAXParseException& e)
131         {
132                 maybe_add (XMLValidationError(e));
133         }
134
135         void error(const SAXParseException& e)
136         {
137                 maybe_add (XMLValidationError(e));
138         }
139
140         void fatalError(const SAXParseException& e)
141         {
142                 maybe_add (XMLValidationError(e));
143         }
144
145         void resetErrors() {}
146
147         list<XMLValidationError> errors () const {
148                 return _errors;
149         }
150
151 private:
152         void maybe_add (XMLValidationError e)
153         {
154                 /* XXX: nasty hack */
155                 if (
156                         e.message() ==
157                         "schema document '/home/carl/src/libdcp/xsd/xml.xsd' has different target namespace "
158                         "from the one specified in instance document 'http://www.w3.org/2001/03/xml.xsd'" ||
159                         e.message() ==
160                         "schema document '/home/carl/src/libdcp/xsd/xmldsig-core-schema.xsd' has different target namespace "
161                         "from the one specified in instance document 'http://www.w3.org/TR/2002/REC-xmldsig-core-20020212/xmldsig-core-schema.xsd'"
162                         ) {
163                         return;
164                 }
165
166                 _errors.push_back (e);
167         }
168
169         list<XMLValidationError> _errors;
170 };
171
172 class StringToXMLCh : public boost::noncopyable
173 {
174 public:
175         StringToXMLCh (string a)
176         {
177                 _buffer = XMLString::transcode(a.c_str());
178         }
179
180         ~StringToXMLCh ()
181         {
182                 XMLString::release (&_buffer);
183         }
184
185         XMLCh const * get () const {
186                 return _buffer;
187         }
188
189 private:
190         XMLCh* _buffer;
191 };
192
193 class LocalFileResolver : public EntityResolver
194 {
195 public:
196         LocalFileResolver (boost::filesystem::path xsd_dtd_directory)
197                 : _xsd_dtd_directory (xsd_dtd_directory)
198         {
199                 add("http://www.w3.org/2001/XMLSchema.dtd", "XMLSchema.dtd");
200                 add("http://www.w3.org/2001/03/xml.xsd", "xml.xsd");
201                 add("http://www.w3.org/TR/2002/REC-xmldsig-core-20020212/xmldsig-core-schema.xsd", "xmldsig-core-schema.xsd");
202         }
203
204         InputSource* resolveEntity(XMLCh const *, XMLCh const * system_id)
205         {
206                 string system_id_str = xml_ch_to_string (system_id);
207                 if (_files.find(system_id_str) == _files.end()) {
208                         return 0;
209                 }
210
211                 boost::filesystem::path p = _xsd_dtd_directory / _files[system_id_str];
212                 StringToXMLCh ch (p.string());
213                 return new LocalFileInputSource(ch.get());
214         }
215
216 private:
217         void add (string uri, string file)
218         {
219                 _files[uri] = file;
220         }
221
222         std::map<string, string> _files;
223         boost::filesystem::path _xsd_dtd_directory;
224 };
225
226 static
227 list<XMLValidationError>
228 validate_xml (boost::filesystem::path xml_file, boost::filesystem::path xsd_dtd_directory)
229 {
230         try {
231                 XMLPlatformUtils::Initialize ();
232         } catch (XMLException& e) {
233                 throw MiscError ("Failed to initialise xerces library");
234         }
235
236         DCPErrorHandler error_handler;
237
238         /* All the xerces objects in this scope must be destroyed before XMLPlatformUtils::Terminate() is called */
239         {
240                 XercesDOMParser parser;
241                 parser.setValidationScheme(XercesDOMParser::Val_Always);
242                 parser.setDoNamespaces(true);
243                 parser.setDoSchema(true);
244
245                 map<string, string> schema;
246                 schema["http://www.w3.org/2000/09/xmldsig#"] = "xmldsig-core-schema.xsd";
247                 schema["http://www.w3.org/TR/2002/REC-xmldsig-core-20020212/xmldsig-core-schema.xsd"] = "xmldsig-core-schema.xsd";
248                 schema["http://www.smpte-ra.org/schemas/429-7/2006/CPL"] = "SMPTE-429-7-2006-CPL.xsd";
249                 schema["http://www.w3.org/2001/03/xml.xsd"] = "xml.xsd";
250
251                 string locations;
252                 for (map<string, string>::const_iterator i = schema.begin(); i != schema.end(); ++i) {
253                         locations += i->first;
254                         locations += " ";
255                         boost::filesystem::path p = xsd_dtd_directory / i->second;
256                         locations += p.string() + " ";
257                 }
258
259                 parser.setExternalSchemaLocation(locations.c_str());
260                 parser.setValidationSchemaFullChecking(true);
261                 parser.setErrorHandler(&error_handler);
262
263                 LocalFileResolver resolver (xsd_dtd_directory);
264                 parser.setEntityResolver(&resolver);
265
266                 try {
267                         parser.resetDocumentPool();
268                         parser.parse(xml_file.string().c_str());
269                 } catch (XMLException& e) {
270                         throw MiscError(xml_ch_to_string(e.getMessage()));
271                 } catch (DOMException& e) {
272                         throw MiscError(xml_ch_to_string(e.getMessage()));
273                 } catch (...) {
274                         throw MiscError("Unknown exception from xerces");
275                 }
276
277         }
278
279         XMLPlatformUtils::Terminate ();
280
281         return error_handler.errors ();
282 }
283
284 static Result
285 verify_asset (shared_ptr<DCP> dcp, shared_ptr<ReelMXF> reel_mxf, function<void (float)> progress)
286 {
287         string const actual_hash = reel_mxf->asset_ref()->hash(progress);
288
289         list<shared_ptr<PKL> > pkls = dcp->pkls();
290         /* We've read this DCP in so it must have at least one PKL */
291         DCP_ASSERT (!pkls.empty());
292
293         shared_ptr<Asset> asset = reel_mxf->asset_ref().asset();
294
295         optional<string> pkl_hash;
296         BOOST_FOREACH (shared_ptr<PKL> i, pkls) {
297                 pkl_hash = i->hash (reel_mxf->asset_ref()->id());
298                 if (pkl_hash) {
299                         break;
300                 }
301         }
302
303         DCP_ASSERT (pkl_hash);
304
305         optional<string> cpl_hash = reel_mxf->hash();
306         if (cpl_hash && *cpl_hash != *pkl_hash) {
307                 return RESULT_CPL_PKL_DIFFER;
308         }
309
310         if (actual_hash != *pkl_hash) {
311                 return RESULT_BAD;
312         }
313
314         return RESULT_GOOD;
315 }
316
317
318 list<VerificationNote>
319 dcp::verify (
320         vector<boost::filesystem::path> directories,
321         function<void (string, optional<boost::filesystem::path>)> stage,
322         function<void (float)> progress,
323         boost::filesystem::path xsd_dtd_directory
324         )
325 {
326         xsd_dtd_directory = boost::filesystem::canonical (xsd_dtd_directory);
327
328         list<VerificationNote> notes;
329
330         list<shared_ptr<DCP> > dcps;
331         BOOST_FOREACH (boost::filesystem::path i, directories) {
332                 dcps.push_back (shared_ptr<DCP> (new DCP (i)));
333         }
334
335         BOOST_FOREACH (shared_ptr<DCP> dcp, dcps) {
336                 stage ("Checking DCP", dcp->directory());
337                 try {
338                         dcp->read (&notes);
339                 } catch (DCPReadError& e) {
340                         notes.push_back (VerificationNote(VerificationNote::VERIFY_ERROR, VerificationNote::Code::GENERAL_READ, string(e.what())));
341                 } catch (XMLError& e) {
342                         notes.push_back (VerificationNote(VerificationNote::VERIFY_ERROR, VerificationNote::Code::GENERAL_READ, string(e.what())));
343                 }
344
345                 BOOST_FOREACH (shared_ptr<CPL> cpl, dcp->cpls()) {
346                         stage ("Checking CPL", cpl->file());
347
348                         list<XMLValidationError> errors = validate_xml (cpl->file().get(), xsd_dtd_directory);
349                         BOOST_FOREACH (XMLValidationError i, errors) {
350                                 notes.push_back (VerificationNote(
351                                                          VerificationNote::VERIFY_ERROR, VerificationNote::Code::XML_VALIDATION_ERROR,
352                                                          String::compose("%1 (on line %2)", i.message(), i.line())
353                                                          ));
354                         }
355
356                         /* Check that the CPL's hash corresponds to the PKL */
357                         BOOST_FOREACH (shared_ptr<PKL> i, dcp->pkls()) {
358                                 optional<string> h = i->hash(cpl->id());
359                                 if (h && make_digest(Data(*cpl->file())) != *h) {
360                                         notes.push_back (VerificationNote(VerificationNote::VERIFY_ERROR, VerificationNote::CPL_HASH_INCORRECT));
361                                 }
362                         }
363
364                         BOOST_FOREACH (shared_ptr<Reel> reel, cpl->reels()) {
365                                 stage ("Checking reel", optional<boost::filesystem::path>());
366                                 if (reel->main_picture()) {
367                                         /* Check reel stuff */
368                                         Fraction const frame_rate = reel->main_picture()->frame_rate();
369                                         if (frame_rate.denominator != 1 ||
370                                             (frame_rate.numerator != 24 &&
371                                              frame_rate.numerator != 25 &&
372                                              frame_rate.numerator != 30 &&
373                                              frame_rate.numerator != 48 &&
374                                              frame_rate.numerator != 50 &&
375                                              frame_rate.numerator != 60 &&
376                                              frame_rate.numerator != 96)) {
377                                                 notes.push_back (VerificationNote(VerificationNote::VERIFY_ERROR, VerificationNote::INVALID_PICTURE_FRAME_RATE));
378                                         }
379                                         /* Check asset */
380                                         if (reel->main_picture()->asset_ref().resolved()) {
381                                                 stage ("Checking picture asset hash", reel->main_picture()->asset()->file());
382                                                 Result const r = verify_asset (dcp, reel->main_picture(), progress);
383                                                 switch (r) {
384                                                 case RESULT_BAD:
385                                                         notes.push_back (
386                                                                         VerificationNote(
387                                                                                 VerificationNote::VERIFY_ERROR, VerificationNote::PICTURE_HASH_INCORRECT, *reel->main_picture()->asset()->file()
388                                                                                 )
389                                                                         );
390                                                         break;
391                                                 case RESULT_CPL_PKL_DIFFER:
392                                                         notes.push_back (VerificationNote(VerificationNote::VERIFY_ERROR, VerificationNote::PKL_CPL_PICTURE_HASHES_DISAGREE));
393                                                         break;
394                                                 default:
395                                                         break;
396                                                 }
397                                         }
398                                 }
399                                 if (reel->main_sound() && reel->main_sound()->asset_ref().resolved()) {
400                                         stage ("Checking sound asset hash", reel->main_sound()->asset()->file());
401                                         Result const r = verify_asset (dcp, reel->main_sound(), progress);
402                                         switch (r) {
403                                         case RESULT_BAD:
404                                                 notes.push_back (
405                                                                 VerificationNote(
406                                                                         VerificationNote::VERIFY_ERROR, VerificationNote::SOUND_HASH_INCORRECT, *reel->main_sound()->asset()->file()
407                                                                         )
408                                                                 );
409                                                 break;
410                                         case RESULT_CPL_PKL_DIFFER:
411                                                 notes.push_back (VerificationNote (VerificationNote::VERIFY_ERROR, VerificationNote::PKL_CPL_SOUND_HASHES_DISAGREE));
412                                                 break;
413                                         default:
414                                                 break;
415                                         }
416                                 }
417                         }
418                 }
419         }
420
421         return notes;
422 }
423
424 string
425 dcp::note_to_string (dcp::VerificationNote note)
426 {
427         switch (note.code()) {
428         case dcp::VerificationNote::GENERAL_READ:
429                 return *note.note();
430         case dcp::VerificationNote::CPL_HASH_INCORRECT:
431                 return "The hash of the CPL in the PKL does not agree with the CPL file";
432         case dcp::VerificationNote::INVALID_PICTURE_FRAME_RATE:
433                 return "The picture in a reel has an invalid frame rate";
434         case dcp::VerificationNote::PICTURE_HASH_INCORRECT:
435                 return dcp::String::compose("The hash of the picture asset %1 does not agree with the PKL file", note.file()->filename());
436         case dcp::VerificationNote::PKL_CPL_PICTURE_HASHES_DISAGREE:
437                 return "The PKL and CPL hashes disagree for a picture asset.";
438         case dcp::VerificationNote::SOUND_HASH_INCORRECT:
439                 return dcp::String::compose("The hash of the sound asset %1 does not agree with the PKL file", note.file()->filename());
440         case dcp::VerificationNote::PKL_CPL_SOUND_HASHES_DISAGREE:
441                 return "The PKL and CPL hashes disagree for a sound asset.";
442         case dcp::VerificationNote::EMPTY_ASSET_PATH:
443                 return "The asset map contains an empty asset path.";
444         case dcp::VerificationNote::MISSING_ASSET:
445                 return "The file for an asset in the asset map cannot be found.";
446         case dcp::VerificationNote::MISMATCHED_STANDARD:
447                 return "The DCP contains both SMPTE and Interop parts.";
448         case dcp::VerificationNote::XML_VALIDATION_ERROR:
449                 return "An XML file is badly formed.";
450         }
451
452         return "";
453 }