aboutsummaryrefslogtreecommitdiffstats
path: root/parse-xml.c
diff options
context:
space:
mode:
authorGravatar Miika Turkia <miika.turkia@gmail.com>2013-03-15 19:02:14 +0200
committerGravatar Dirk Hohndel <dirk@hohndel.org>2013-03-15 16:29:37 -0700
commit757791335f212a189790452cb2d467c31a2ae672 (patch)
tree1a30448a7f69efd479c711f1999333f2502ee767 /parse-xml.c
parent98d769a02fc4f42c5afb20153847ab358ecc126f (diff)
downloadsubsurface-757791335f212a189790452cb2d467c31a2ae672.tar.gz
Support divelogs.de exports that include Cyrillic characters
divelogs.de sends us XML files that explicitly state that they are in ISO-8859-1 encoding (which is true). These files contain HTML-encoded Cyrillic characters. Once we decode those characters, the resulting file is actually UTF-8 encoded (which is a superset of ISO-8859-1). That seriously confuses libxml when it tries to parse things. So instead, recognize divelogs.de files and skip the encoding declaration for them before decoding the HTML-encoded non-ISO-8859-1 characters.

This does show, however, that divelogs.de incorrectly truncates the encoded strings (at least in some sample data that I created, the parsing throws errors because of that).

Reported-by: Sergey Starosek <sergey.starosek@gmail.com>
Based-on-code-by: Miika Turkia <miika.turkia@gmail.com>
Signed-off-by: Dirk Hohndel <dirk@hohndel.org>
Diffstat (limited to 'parse-xml.c')
-rw-r--r--parse-xml.c24
1 files changed, 23 insertions, 1 deletions
diff --git a/parse-xml.c b/parse-xml.c
index 4cdc3d8ab..b24806bba 100644
--- a/parse-xml.c
+++ b/parse-xml.c
@@ -8,6 +8,7 @@
#define __USE_XOPEN
#include <time.h>
#include <libxml/parser.h>
+#include <libxml/parserInternals.h>
#include <libxml/tree.h>
#ifdef XSLT
#include <libxslt/transform.h>
@@ -1533,13 +1534,34 @@ static void reset_all(void)
import_source = UNKNOWN;
}
+/* divelogs.de sends us XML files that claim to be ISO-8859-1,
+ * but once we decode the HTML encoded characters they turn
+ * into UTF-8 instead. So skip the incorrect encoding
+ * declaration (everything in front of the <DIVELOGSDATA> tag)
+ * and decode the HTML encoded characters.
+ *
+ * Returns the buffer itself, untouched, when it contains no
+ * <DIVELOGSDATA> tag or when the decoding fails, so the result
+ * is never NULL for a valid buffer. Otherwise returns a newly
+ * allocated string from libxml2; the caller owns it and can
+ * tell it apart from the input because it differs from buffer
+ * (release it with xmlFree()). */
+const char *preprocess_divelog_de(const char *buffer)
+{
+	const char *start = strstr(buffer, "<DIVELOGSDATA>");
+	xmlParserCtxtPtr ctx;
+	xmlChar *decoded;
+	char dummy[] = "";
+
+	/* not a divelogs.de export - nothing to preprocess */
+	if (!start)
+		return buffer;
+
+	/* the entity decoder needs a parser context to work in;
+	 * what that context was created from does not matter */
+	ctx = xmlCreateMemoryParserCtxt(dummy, sizeof(dummy));
+	if (!ctx)
+		return buffer;
+
+	decoded = xmlStringLenDecodeEntities(ctx, (const xmlChar *)start, strlen(start),
+					     XML_SUBSTITUTE_REF, 0, 0, 0);
+
+	/* the context is only needed for the decode call above;
+	 * free it here so it is not leaked on every import */
+	xmlFreeParserCtxt(ctx);
+
+	/* fall back to the original data instead of handing NULL
+	 * to a caller that runs strlen() on the result */
+	return decoded ? (const char *)decoded : buffer;
+}
+
void parse_xml_buffer(const char *url, const char *buffer, int size,
struct dive_table *table, GError **error)
{
xmlDoc *doc;
+ const char *res = preprocess_divelog_de(buffer);
target_table = table;
- doc = xmlReadMemory(buffer, size, url, NULL, 0);
+ doc = xmlReadMemory(res, strlen(res), url, NULL, 0);
if (!doc) {
fprintf(stderr, _("Failed to parse '%s'.\n"), url);
parser_error(error, _("Failed to parse '%s'"), url);