diff options
author | Berthold Stoeger <bstoeger@mail.tuwien.ac.at> | 2018-09-15 19:11:01 +0200 |
---|---|---|
committer | Dirk Hohndel <dirk@hohndel.org> | 2018-09-25 14:59:14 -0700 |
commit | cc4f48be3f49ffef51c7c0cb328de709a05c4a7d (patch) | |
tree | 9b0f1671a2b3f0d1d7a4449f21531625d2514237 | |
parent | 0aab39b35dd034a558282ca8ee53e9dfe838e0b9 (diff) | |
download | subsurface-cc4f48be3f49ffef51c7c0cb328de709a05c4a7d.tar.gz |
Metadata: rudimentary support for XMP metadata in MP4-based videos
XMP is a media-metadata standard based on XML which may be used
across a variety of media formats. Some video-processing software
writes XMP data without updating the native metadata fields.
Therefore, we should aim at reading XMP metadata and give priority
to XMP data over native fields.
Pros:
- Support for *all* common media formats.
Cons:
- XML (complex, verbose, chaotic).
- Does not even come close to fulfilling its promise of being
well defined (see below).
Implement a simple XMP-parser using libxml2. Connect the XMP-parser to
the existing Quicktime/MP4 parser.
First problem encountered: According to the spec, XMP data is supposed
to be put in the 'XMP_' atom. But for example exiftools instead
writes an 'uuid' atom with a special 16-byte uid. Implement both,
more options will probably follow.
Second problem: two versions of recording the creation date were found
1) The content of a <exif:DateTimeOriginal> tag.
2) The xmp:CreateDate attribute of a <rdf:Description> tag.
Here too, more versions are expected to surface and will have
to be supported in due course (with an obvious priority problem).
Signed-off-by: Berthold Stoeger <bstoeger@mail.tuwien.ac.at>
-rw-r--r-- | core/CMakeLists.txt | 1 | ||||
-rw-r--r-- | core/metadata.cpp | 39 | ||||
-rw-r--r-- | core/xmp_parser.cpp | 138 | ||||
-rw-r--r-- | core/xmp_parser.h | 11 |
4 files changed, 185 insertions, 4 deletions
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 310ba6659..dec719929 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -69,6 +69,7 @@ set(SUBSURFACE_CORE_LIB_SRCS load-git.c membuffer.c metadata.cpp + xmp_parser.cpp metrics.cpp ostctools.c parse-xml.c diff --git a/core/metadata.cpp b/core/metadata.cpp index 2457c69c0..b549a5b3b 100644 --- a/core/metadata.cpp +++ b/core/metadata.cpp @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "metadata.h" +#include "xmp_parser.h" #include "exif.h" #include "qthelper.h" #include <QString> @@ -111,6 +112,15 @@ static bool parseExif(QFile &f, struct metadata *metadata) } } +// Parse an embedded XMP block. Note that this is likely generated by +// external tools and therefore we give priority of XMP data over +// native metadata. +static void parseXMP(const char *data, size_t size, metadata *metadata) +{ + if (timestamp_t timestamp = parse_xmp(data, size)) + metadata->timestamp = timestamp; +} + static bool parseMP4(QFile &f, metadata *metadata) { f.seek(0); @@ -170,8 +180,9 @@ static bool parseMP4(QFile &f, metadata *metadata) if (!memcmp(type, "moov", 4) || !memcmp(type, "trak", 4) || - !memcmp(type, "mdia", 4)) { - // Recurse into "moov", "trak" and "mdia" atoms + !memcmp(type, "mdia", 4) || + !memcmp(type, "udta", 4)) { + // Recurse into "moov", "trak", "mdia" and "udta" atoms atom_stack.push_back(atom_size); continue; } else if (!memcmp(type, "mdhd", 4) && atom_size >= 24 && atom_size < 4096) { @@ -203,10 +214,30 @@ static bool parseMP4(QFile &f, metadata *metadata) metadata->duration.seconds = lrint((double)duration / timescale); // Timestamp is given as seconds since midnight 1904/1/1. To be convertible to the UNIX epoch // it must be larger than 2082844800. - if (timestamp >= 2082844800) { + // Note that we only set timestamp if not already set, because we give priority to XMP data. 
+ if (!metadata->timestamp && timestamp >= 2082844800) { metadata->timestamp = timestamp - 2082844800; - // Currently, we only know how to extract timestamps, so we might just quit parsing here. + // We got our timestamp and duration. Nevertheless, we continue + // parsing, as there might still be an XMP atom. + } + } else if (!memcmp(type, "XMP_", 4) && atom_size > 32 && atom_size < 100000) { + // Parse embedded XMP data. + std::vector<char> d(atom_size); + if (f.read(&d[0], atom_size) != static_cast<int>(atom_size)) break; + + parseXMP(&d[0], atom_size, metadata); + } else if (!memcmp(type, "uuid", 4) && atom_size > 32 && atom_size < 100000) { + // UUID atoms with uid "BE7ACFCB97A942E89C71999491E3AFAC" contain XMP blocks + // according the JPEG 2000 standard. exiftools produces mp4-style videos with such + // an UUID atom. + std::vector<char> d(atom_size); + if (f.read(&d[0], atom_size) != static_cast<int>(atom_size)) + break; + + static const char xmp_uid[17] = "\xBE\x7A\xCF\xCB\x97\xA9\x42\xE8\x9C\x71\x99\x94\x91\xE3\xAF\xAC"; + if (!memcmp(&d[0], xmp_uid, 16)) { + parseXMP(&d[16], atom_size - 16, metadata); } } else { // Jump over unknown atom diff --git a/core/xmp_parser.cpp b/core/xmp_parser.cpp new file mode 100644 index 000000000..0725030fb --- /dev/null +++ b/core/xmp_parser.cpp @@ -0,0 +1,138 @@ +#include "xmp_parser.h" +#include "subsurface-string.h" + +#include <libxml/parser.h> +#include <libxml/tree.h> +#include <cctype> + +extern "C" timestamp_t utc_mktime(struct tm *tm); // declared in core/dive.h + +static timestamp_t parse_xmp_date(const char *date) +{ + // Format: "yyyy-mm-dd[Thh:mm[:ss[.ms]][-05:00]]" + int year, month, day; + if (sscanf(date, "%d-%d-%d", &year, &month, &day) != 3) + return 0; + + int hours = 0, minutes = 0, seconds = 0, milliseconds = 0; + int timezone = 0; + + // Check for time part + if ((date = strchr(date, 'T')) != nullptr) { + ++date; // Skip 'T' + if (sscanf(date, "%d:%d:%d.%d", &hours, &minutes, &seconds, &milliseconds) 
< 2) + return 0; + + // Check for timezone part. Note that we simply ignore 'Z' as that + // means no time zone + while (*date && *date != '+' && *date != '-') + ++date; + if (*date) { + int sign = *date == '+' ? 1 : -1; + int timezone_hours, timezone_minutes; + ++date; + if (sscanf(date, "%d:%d", &timezone_hours, &timezone_minutes) != 2) + return 0; + timezone = sign * (timezone_hours * 60 + timezone_minutes) * 60; + } + } + + // Round to seconds, since our timestamps are in seconds + if (milliseconds >= 500) + seconds += 1; + + struct tm tm = { 0 }; + tm.tm_year = year - 1900; + tm.tm_mon = month - 1; + tm.tm_mday = day; + tm.tm_hour = hours; + tm.tm_min = minutes; + tm.tm_sec = seconds; + + timestamp_t res = utc_mktime(&tm); + res += timezone; + + return res; +} + +static timestamp_t extract_timestamp_from_attributes(const xmlNode *node) +{ + for (const xmlAttr *p = node->properties; p; p = p->next) { + const xmlChar *ns = p->ns ? p->ns->prefix : nullptr; + + // Check for xmp::CreateDate property + if (!strcmp((const char *)ns, "xmp") && !strcmp((const char *)p->name, "CreateDate")) { + // We only support a single property value + if (!p->children || !p->children->content) + return 0; + const char *date = (const char *)p->children->content; + return parse_xmp_date(date); + } + } + return 0; +} + +static timestamp_t extract_timestamp(const xmlNode *node) +{ + // We use a private stack, so that we can return in one go without + // having to unwind the call-stack. We only recurse to a fixed depth, + // since the data we are interested in are at a shallow depth. + // This can be increased on demand. 
+ static const int max_recursion_depth = 16; + const xmlNode *stack[max_recursion_depth]; + stack[0] = node; + int stack_depth = 1; + + while (stack_depth > 0) { + const xmlNode *node = stack[stack_depth - 1]; + // Parse attributes + timestamp_t timestamp = extract_timestamp_from_attributes(node); + if (timestamp) + return timestamp; + + // Parse content, if not blank node. Content can only be at the second level, + // since it is always contained in a tag. + // TODO: We have to cast node to pointer to non-const, since we're supporting + // old libxml2 versions, where xmlIsBlankNode takes such a pointer. Remove + // in due course. + if (!xmlIsBlankNode((xmlNode *)node) && stack_depth >= 2) { + const xmlNode *parent = stack[stack_depth - 2]; + // If this is a text node and the parent node is exif:DateTimeOriginal, try to parse as date + if (!node->ns && parent->ns && + same_string((const char *)parent->ns->prefix, "exif") && + same_string((const char *)parent->name, "DateTimeOriginal")) { + const char *date = (const char *)node->content; + timestamp_t res = parse_xmp_date(date); + if(res) + return res; + } + } + + // If there are sub-items and we haven't reached recursion depth, recurse + if (node->children && stack_depth < max_recursion_depth) { + stack[stack_depth++] = node->children; + continue; + } + + // Advance stack to next node in this level + while (stack_depth > 0) { + if ((stack[stack_depth - 1] = stack[stack_depth - 1]->next) != nullptr) + break; + // No more nodes at this level -> go up a level. + --stack_depth; + } + } + return 0; +} + +timestamp_t parse_xmp(const char *data, size_t size) +{ + const char *encoding = xmlGetCharEncodingName(XML_CHAR_ENCODING_UTF8); + // TODO: What do we pass as URL-parameter? 
+ xmlDoc *doc = xmlReadMemory(data, size, "url", encoding, 0); + if (!doc) + return 0; + timestamp_t res = extract_timestamp(xmlDocGetRootElement(doc)); + xmlFreeDoc(doc); + return res; +} diff --git a/core/xmp_parser.h b/core/xmp_parser.h new file mode 100644 index 000000000..8bdcb6de4 --- /dev/null +++ b/core/xmp_parser.h @@ -0,0 +1,11 @@ +// Parse XMP blocks using libxml2 + +#ifndef XMP_PARSER_H +#define XMP_PARSER_H + +#include "units.h" // for timestamp_t +#include <stddef.h> // for size_t + +timestamp_t parse_xmp(const char *data, size_t size); // On failure returns 0. + +#endif // XMP_PARSER_H |