From 757791335f212a189790452cb2d467c31a2ae672 Mon Sep 17 00:00:00 2001 From: Miika Turkia Date: Fri, 15 Mar 2013 19:02:14 +0200 Subject: Support divelogs.de exports that include Cyrillic characters divelogs.de sends us XML files that explicitly state that they are in ISO-8859-1 encoding (which is true). These files contain HTML-encoded Cyrillic characters. Once we decode those characters, the resulting file is actually UTF-8 encoded (Unicode is a superset of ISO-8859-1, but its UTF-8 byte encoding is not compatible with ISO-8859-1 outside of ASCII). That seriously confuses libxml when it tries to parse things. So instead, recognize divelogs.de files and skip the encoding declaration for them before decoding the HTML-encoded non-ISO-8859-1 characters. This does show, however, that divelogs.de incorrectly truncates the encoded strings (at least in some sample data that I created, the parsing throws errors because of that). Reported-by: Sergey Starosek Based-on-code-by: Miika Turkia Signed-off-by: Dirk Hohndel --- parse-xml.c | 24 +++++++++++++++++++++++- xslt/divelogs.xslt | 2 +- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/parse-xml.c b/parse-xml.c index 4cdc3d8ab..b24806bba 100644 --- a/parse-xml.c +++ b/parse-xml.c @@ -8,6 +8,7 @@ #define __USE_XOPEN #include #include +#include #include #ifdef XSLT #include @@ -1533,13 +1534,34 @@ static void reset_all(void) import_source = UNKNOWN; } +/* divelog.de sends us xml files that claim to be iso-8859-1 + * but once we decode the HTML encoded characters they turn + * into UTF-8 instead. 
So skip the incorrect encoding + * declaration and decode the HTML encoded characters */ +const char *preprocess_divelog_de(const char *buffer) +{ + char *ret = strstr(buffer, ""); + + if (ret) { + xmlParserCtxtPtr ctx; + char buf[] = ""; + + ctx = xmlCreateMemoryParserCtxt(buf, sizeof(buf)); + ret = xmlStringLenDecodeEntities(ctx, ret, strlen(ret), XML_SUBSTITUTE_REF, 0, 0, 0); + + return ret; + } + return buffer; +} + void parse_xml_buffer(const char *url, const char *buffer, int size, struct dive_table *table, GError **error) { xmlDoc *doc; + const char *res = preprocess_divelog_de(buffer); target_table = table; - doc = xmlReadMemory(buffer, size, url, NULL, 0); + doc = xmlReadMemory(res, strlen(res), url, NULL, 0); if (!doc) { fprintf(stderr, _("Failed to parse '%s'.\n"), url); parser_error(error, _("Failed to parse '%s'"), url); diff --git a/xslt/divelogs.xslt b/xslt/divelogs.xslt index f66ffccfe..c0585a540 100644 --- a/xslt/divelogs.xslt +++ b/xslt/divelogs.xslt @@ -1,7 +1,7 @@ - + -- cgit v1.2.3-70-g09d2