summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGravatar Berthold Stoeger <bstoeger@mail.tuwien.ac.at>2018-09-15 19:11:01 +0200
committerGravatar Dirk Hohndel <dirk@hohndel.org>2018-09-25 14:59:14 -0700
commitcc4f48be3f49ffef51c7c0cb328de709a05c4a7d (patch)
tree9b0f1671a2b3f0d1d7a4449f21531625d2514237
parent0aab39b35dd034a558282ca8ee53e9dfe838e0b9 (diff)
downloadsubsurface-cc4f48be3f49ffef51c7c0cb328de709a05c4a7d.tar.gz
Metadata: rudimentary support for XMP metadata in MP4-based videos
XMP is a media-metadata standard based on XML which may be used across a variety of media formats. Some video-processing software writes XMP data without updating the native metadata fields. Therefore, we should aim at reading XMP metadata and give priority to XMP data over native fields.

Pros:
- Support for *all* common media formats.
Cons:
- XML (complex, verbose, chaotic).
- Does not even come close to fulfilling its promise of being well defined (see below).

Implement a simple XMP parser using libxml2. Connect the XMP parser to the existing Quicktime/MP4 parser.

First problem encountered: According to the spec, XMP data is supposed to be put in the 'XMP_' atom. But exiftool, for example, instead writes a 'uuid' atom with a special 16-byte UID. Implement both; more options will probably follow.

Second problem: two ways of recording the creation date were found:
1) The content of an <exif:DateTimeOriginal> tag.
2) The xmp:CreateDate attribute of an <rdf:Description> tag.
Here too, more versions are expected to surface and will have to be supported in due course (with an obvious priority problem).

Signed-off-by: Berthold Stoeger <bstoeger@mail.tuwien.ac.at>
-rw-r--r--core/CMakeLists.txt1
-rw-r--r--core/metadata.cpp39
-rw-r--r--core/xmp_parser.cpp138
-rw-r--r--core/xmp_parser.h11
4 files changed, 185 insertions, 4 deletions
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 310ba6659..dec719929 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -69,6 +69,7 @@ set(SUBSURFACE_CORE_LIB_SRCS
load-git.c
membuffer.c
metadata.cpp
+ xmp_parser.cpp
metrics.cpp
ostctools.c
parse-xml.c
diff --git a/core/metadata.cpp b/core/metadata.cpp
index 2457c69c0..b549a5b3b 100644
--- a/core/metadata.cpp
+++ b/core/metadata.cpp
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "metadata.h"
+#include "xmp_parser.h"
#include "exif.h"
#include "qthelper.h"
#include <QString>
@@ -111,6 +112,15 @@ static bool parseExif(QFile &f, struct metadata *metadata)
}
}
+// Feed an embedded XMP block to the XMP parser. XMP blocks are typically
+// written by external tools, so a timestamp found there overrides whatever
+// is currently stored in the metadata. parse_xmp() reports failure as 0,
+// in which case the metadata is left untouched.
+static void parseXMP(const char *data, size_t size, metadata *metadata)
+{
+	const timestamp_t xmp_timestamp = parse_xmp(data, size);
+	if (xmp_timestamp == 0)
+		return;
+	metadata->timestamp = xmp_timestamp;
+}
+
static bool parseMP4(QFile &f, metadata *metadata)
{
f.seek(0);
@@ -170,8 +180,9 @@ static bool parseMP4(QFile &f, metadata *metadata)
if (!memcmp(type, "moov", 4) ||
!memcmp(type, "trak", 4) ||
- !memcmp(type, "mdia", 4)) {
- // Recurse into "moov", "trak" and "mdia" atoms
+ !memcmp(type, "mdia", 4) ||
+ !memcmp(type, "udta", 4)) {
+ // Recurse into "moov", "trak", "mdia" and "udta" atoms
atom_stack.push_back(atom_size);
continue;
} else if (!memcmp(type, "mdhd", 4) && atom_size >= 24 && atom_size < 4096) {
@@ -203,10 +214,30 @@ static bool parseMP4(QFile &f, metadata *metadata)
metadata->duration.seconds = lrint((double)duration / timescale);
// Timestamp is given as seconds since midnight 1904/1/1. To be convertible to the UNIX epoch
// it must be larger than 2082844800.
- if (timestamp >= 2082844800) {
+ // Note that we only set timestamp if not already set, because we give priority to XMP data.
+ if (!metadata->timestamp && timestamp >= 2082844800) {
metadata->timestamp = timestamp - 2082844800;
- // Currently, we only know how to extract timestamps, so we might just quit parsing here.
+ // We got our timestamp and duration. Nevertheless, we continue
+ // parsing, as there might still be an XMP atom.
+ }
+ } else if (!memcmp(type, "XMP_", 4) && atom_size > 32 && atom_size < 100000) {
+ // Parse embedded XMP data.
+ std::vector<char> d(atom_size);
+ if (f.read(&d[0], atom_size) != static_cast<int>(atom_size))
break;
+
+ parseXMP(&d[0], atom_size, metadata);
+ } else if (!memcmp(type, "uuid", 4) && atom_size > 32 && atom_size < 100000) {
+ // UUID atoms with uid "BE7ACFCB97A942E89C71999491E3AFAC" contain XMP blocks
+ // according the JPEG 2000 standard. exiftools produces mp4-style videos with such
+ // an UUID atom.
+ std::vector<char> d(atom_size);
+ if (f.read(&d[0], atom_size) != static_cast<int>(atom_size))
+ break;
+
+ static const char xmp_uid[17] = "\xBE\x7A\xCF\xCB\x97\xA9\x42\xE8\x9C\x71\x99\x94\x91\xE3\xAF\xAC";
+ if (!memcmp(&d[0], xmp_uid, 16)) {
+ parseXMP(&d[16], atom_size - 16, metadata);
}
} else {
// Jump over unknown atom
diff --git a/core/xmp_parser.cpp b/core/xmp_parser.cpp
new file mode 100644
index 000000000..0725030fb
--- /dev/null
+++ b/core/xmp_parser.cpp
@@ -0,0 +1,138 @@
+#include "xmp_parser.h"
+#include "subsurface-string.h"
+
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include <cctype>
+
+extern "C" timestamp_t utc_mktime(struct tm *tm); // declared in core/dive.h
+
+// Parse an XMP date string of the form
+//	"yyyy-mm-dd[Thh:mm[:ss[.fraction]][(+|-)hh:mm]]"
+// and convert it with utc_mktime(). If a time-zone offset is present it is
+// added (in seconds) to the result. A trailing 'Z' is ignored.
+// Returns 0 if the string is missing, malformed or contains impossible
+// calendar / clock values.
+static timestamp_t parse_xmp_date(const char *date)
+{
+	// Callers pass XML node content, which may be absent.
+	if (!date)
+		return 0;
+
+	int year, month, day;
+	if (sscanf(date, "%d-%d-%d", &year, &month, &day) != 3)
+		return 0;
+	// The string comes from an untrusted file: reject impossible values
+	// instead of handing them to utc_mktime().
+	if (month < 1 || month > 12 || day < 1 || day > 31)
+		return 0;
+
+	int hours = 0, minutes = 0, seconds = 0;
+	int tenths = 0;		// first digit of the fractional seconds
+	int timezone = 0;	// offset in seconds
+
+	// Check for time part
+	if ((date = strchr(date, 'T')) != nullptr) {
+		++date; // Skip 'T'
+		// The fraction may have any number of digits, so it must not be read
+		// as an integer number of milliseconds (".5" is 500 ms, not 5 ms).
+		// Only the first digit matters for rounding; "%1d" reads exactly that.
+		if (sscanf(date, "%d:%d:%d.%1d", &hours, &minutes, &seconds, &tenths) < 2)
+			return 0;
+		// Seconds may be 60 to allow for a leap second.
+		if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59 ||
+		    seconds < 0 || seconds > 60)
+			return 0;
+
+		// Check for timezone part. Note that we simply ignore 'Z' as that
+		// means no time zone
+		while (*date && *date != '+' && *date != '-')
+			++date;
+		if (*date) {
+			int sign = *date == '+' ? 1 : -1;
+			int timezone_hours, timezone_minutes;
+			++date;
+			if (sscanf(date, "%d:%d", &timezone_hours, &timezone_minutes) != 2)
+				return 0;
+			timezone = sign * (timezone_hours * 60 + timezone_minutes) * 60;
+		}
+	}
+
+	// Round to seconds, since our timestamps are in seconds
+	if (tenths >= 5)
+		seconds += 1;
+
+	struct tm tm = { 0 };
+	tm.tm_year = year - 1900;
+	tm.tm_mon = month - 1;
+	tm.tm_mday = day;
+	tm.tm_hour = hours;
+	tm.tm_min = minutes;
+	tm.tm_sec = seconds;
+
+	timestamp_t res = utc_mktime(&tm);
+	// NOTE(review): the offset is added, not subtracted - confirm that this
+	// is the intended time base for timestamps.
+	res += timezone;
+
+	return res;
+}
+
+// Look for an xmp:CreateDate attribute on the given node and parse its value.
+// Returns 0 if the node is not an element, has no such attribute, or the
+// attribute value is missing or cannot be parsed as a date.
+static timestamp_t extract_timestamp_from_attributes(const xmlNode *node)
+{
+	// Only element nodes carry an attribute list.
+	if (!node || node->type != XML_ELEMENT_NODE)
+		return 0;
+
+	for (const xmlAttr *p = node->properties; p; p = p->next) {
+		// Attributes without a namespace have no prefix at all; these must be
+		// skipped rather than passed to strcmp() as a null pointer.
+		const xmlChar *prefix = p->ns ? p->ns->prefix : nullptr;
+		if (!prefix || !p->name)
+			continue;
+
+		// Check for xmp:CreateDate property
+		if (strcmp((const char *)prefix, "xmp") != 0 ||
+		    strcmp((const char *)p->name, "CreateDate") != 0)
+			continue;
+
+		// We only support a single property value
+		if (!p->children || !p->children->content)
+			return 0;
+		return parse_xmp_date((const char *)p->children->content);
+	}
+	return 0;
+}
+
+// Walk the XML tree rooted at "node" depth-first and return the first
+// creation timestamp found, either as an attribute (see
+// extract_timestamp_from_attributes()) or as the text content of an
+// <exif:DateTimeOriginal> element. Returns 0 if nothing was found.
+static timestamp_t extract_timestamp(const xmlNode *node)
+{
+	// A document may have no root element.
+	if (!node)
+		return 0;
+
+	// We use a private stack, so that we can return in one go without
+	// having to unwind the call-stack. We only recurse to a fixed depth,
+	// since the data we are interested in are at a shallow depth.
+	// This can be increased on demand.
+	static const int max_recursion_depth = 16;
+	const xmlNode *stack[max_recursion_depth];
+	stack[0] = node;
+	int stack_depth = 1;
+
+	while (stack_depth > 0) {
+		const xmlNode *cur = stack[stack_depth - 1];
+		const bool is_element = cur->type == XML_ELEMENT_NODE;
+
+		// Parse attributes. Only element nodes have an attribute list.
+		if (is_element) {
+			timestamp_t timestamp = extract_timestamp_from_attributes(cur);
+			if (timestamp)
+				return timestamp;
+		}
+
+		// Parse content, if not blank node. Content can only be at the second level,
+		// since it is always contained in a tag.
+		// TODO: We have to cast node to pointer to non-const, since we're supporting
+		// old libxml2 versions, where xmlIsBlankNode takes such a pointer. Remove
+		// in due course.
+		if (stack_depth >= 2 && !xmlIsBlankNode((xmlNode *)cur)) {
+			const xmlNode *parent = stack[stack_depth - 2];
+			// If this is a text node and the parent node is exif:DateTimeOriginal, try to parse as date.
+			// Nodes without content (e.g. un-namespaced child elements) are skipped.
+			if (!cur->ns && cur->content && parent->ns &&
+			    same_string((const char *)parent->ns->prefix, "exif") &&
+			    same_string((const char *)parent->name, "DateTimeOriginal")) {
+				timestamp_t res = parse_xmp_date((const char *)cur->content);
+				if (res)
+					return res;
+			}
+		}
+
+		// If there are sub-items and we haven't reached recursion depth, recurse.
+		// Only descend into elements: the children of e.g. an entity reference
+		// are entity declarations, which are not laid out like xmlNode.
+		if (is_element && cur->children && stack_depth < max_recursion_depth) {
+			stack[stack_depth++] = cur->children;
+			continue;
+		}
+
+		// Advance stack to next node in this level
+		while (stack_depth > 0) {
+			if ((stack[stack_depth - 1] = stack[stack_depth - 1]->next) != nullptr)
+				break;
+			// No more nodes at this level -> go up a level.
+			--stack_depth;
+		}
+	}
+	return 0;
+}
+
+// Parse an XMP block of "size" bytes starting at "data" and return the
+// creation timestamp found in it. The buffer is handed to libxml2 with an
+// explicit length and is parsed as UTF-8.
+// Returns 0 on failure: no or oversized input, XML that cannot be parsed,
+// or a document without a recognized timestamp.
+timestamp_t parse_xmp(const char *data, size_t size)
+{
+	// xmlReadMemory() takes the buffer length as int; refuse anything that
+	// does not survive the conversion instead of passing a truncated length.
+	const int len = static_cast<int>(size);
+	if (!data || len <= 0 || static_cast<size_t>(len) != size)
+		return 0;
+
+	const char *encoding = xmlGetCharEncodingName(XML_CHAR_ENCODING_UTF8);
+	// The block comes from an arbitrary media file, so forbid any network
+	// access while parsing (XML_PARSE_NONET).
+	// TODO: What do we pass as URL-parameter?
+	xmlDoc *doc = xmlReadMemory(data, len, "url", encoding, XML_PARSE_NONET);
+	if (!doc)
+		return 0;
+
+	// A successfully parsed document may still lack a root element.
+	const xmlNode *root = xmlDocGetRootElement(doc);
+	timestamp_t res = root ? extract_timestamp(root) : 0;
+	xmlFreeDoc(doc);
+	return res;
+}
diff --git a/core/xmp_parser.h b/core/xmp_parser.h
new file mode 100644
index 000000000..8bdcb6de4
--- /dev/null
+++ b/core/xmp_parser.h
@@ -0,0 +1,11 @@
+// Parse XMP blocks using libxml2
+
+#ifndef XMP_PARSER_H
+#define XMP_PARSER_H
+
+#include "units.h" // for timestamp_t
+#include <stddef.h> // for size_t
+
+timestamp_t parse_xmp(const char *data, size_t size); // On failure returns 0.
+
+#endif // XMP_PARSER_H