diff options
author | Berthold Stoeger <bstoeger@mail.tuwien.ac.at> | 2020-02-16 22:19:44 +0100 |
---|---|---|
committer | Dirk Hohndel <dirk@hohndel.org> | 2020-03-01 10:21:44 -0800 |
commit | 6434ad262887a473abd410fd8011bba1291663d6 (patch) | |
tree | 6d0fab7c898c7daf0ab2fe53b2977d94cef60403 /core | |
parent | 41aae1aebe1aea4db58d61a171f7ecd76b6282d7 (diff) | |
download | subsurface-6434ad262887a473abd410fd8011bba1291663d6.tar.gz |
filter: add fulltext filtering code
Add code that indexes all words of a dive and provides searching
for words.
A query is represented by the FullTextQuery class, which can be
initialized by assigning a string to it. It is basically a list
of words.
The result of a search is stored in the FullTextResult class,
which is a list of dives.
The actual indexing and searching is implemented in the FullText
class. However, this class is not exported because the interface
is partially accessible to C. Notably, the reloading of the
fulltext index is done from the C core.
Currently, the indexing and searching is totally unoptimized.
In a ~4000 dives test-log searches typically took single-digit
ms times. There is ample room for optimization (e.g. when
searching for multiple words, chose the words with few dives
first and when down to a few dives, check them individually).
The words of each dive are tokenized and uppercased and
cached with the dive. A pointer to these words is stashed
in the dive structure.
For now, compile only on desktop.
Signed-off-by: Berthold Stoeger <bstoeger@mail.tuwien.ac.at>
Diffstat (limited to 'core')
-rw-r--r-- | core/CMakeLists.txt | 2 | ||||
-rw-r--r-- | core/dive.h | 3 | ||||
-rw-r--r-- | core/fulltext.cpp | 291 | ||||
-rw-r--r-- | core/fulltext.h | 69 |
4 files changed, 365 insertions, 0 deletions
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 24df1854b..d3f1401f1 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -88,6 +88,8 @@ set(SUBSURFACE_CORE_LIB_SRCS file.h format.cpp format.h + fulltext.cpp + fulltext.h gas.c gas.h gas-model.c diff --git a/core/dive.h b/core/dive.h index e602b4f4f..345eb5073 100644 --- a/core/dive.h +++ b/core/dive.h @@ -170,6 +170,9 @@ struct dive { bool notrip; /* Don't autogroup this dive to a trip */ bool selected; bool hidden_by_filter; +#if !defined(SUBSURFACE_MOBILE) + struct full_text_cache *full_text; /* word cache for full text search */ +#endif #if defined(SUBSURFACE_MOBILE) uint8_t collapsed; /* four values: 0 = don't show, 1 = show as dive, 2 = show corresponding trip, 3 = show dive and trip */ #endif diff --git a/core/fulltext.cpp b/core/fulltext.cpp new file mode 100644 index 000000000..5ed0c9a17 --- /dev/null +++ b/core/fulltext.cpp @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fulltext.h" +#include "dive.h" +#include "divesite.h" +#include "trip.h" +#include <QLocale> +#include <map> + +#ifndef SUBSURFACE_MOBILE + +// This class caches each dives words, so that we can unregister a dive from the full text search +struct full_text_cache { + std::vector<QString> words; +}; + +// The FullText-search class +class FullText { + std::map<QString, std::vector<dive *>> words; // Dives that belong to each word +public: + + void reload(); // Rebuild from current dive_table + void registerDive(struct dive *d); // Note: can be called repeatedly + void unregisterDive(struct dive *d); // Note: can be called repeatedly + void unregisterAll(); // Unregister all dives in the dive table + FullTextResult find(const FullTextQuery &q, StringFilterMode mode) const; // Find dives matchin all words. +private: + void registerWords(struct dive *d, const std::vector<QString> &w); + void unregisterWords(struct dive *d, const std::vector<QString> &w); + std::vector<dive *> findDives(const QString &s, StringFilterMode mode) const; // Find dives matching a given word. +}; + +// This class doesn't depend on any other objects, we might just initialize it at startup. +static FullText self; + +// C-interface functions + +extern "C" { + +void fulltext_register(struct dive *d) +{ + self.registerDive(d); +} + +void fulltext_unregister(struct dive *d) +{ + self.unregisterDive(d); +} + +void fulltext_unregister_all() +{ + self.unregisterAll(); +} + +void fulltext_reload() +{ + self.reload(); +} + +} // extern "C" + +// C++-only interface functions +FullTextResult fulltext_find_dives(const FullTextQuery &q, StringFilterMode mode) +{ + return self.find(q, mode); +} + +// Check whether a single dive matches the fulltext criterion +bool fulltext_dive_matches(const struct dive *d, const FullTextQuery &q, StringFilterMode mode) +{ + if (!q.doit()) + return true; + if (!d->full_text) + return false; + auto matchFunc = + mode == StringFilterMode::EXACT ? [](const QString &s1, const QString &s2) { return s1 == s2; } : + mode == StringFilterMode::STARTSWITH ? [](const QString &s1, const QString &s2) { return s1.startsWith(s2); } : + /* mode == StringFilterMode::SUBSTRING ? */ [](const QString &s1, const QString &s2) { return s1.contains(s2); }; + const std::vector<QString> &words = d->full_text->words; + for (const QString &search: q.words) { + if (std::any_of(words.begin(), words.end(), [&search,matchFunc](const QString &w) { return matchFunc(w, search); })) + return true; + } + return false; +} + +// Class implementation + +// Take a text and tokenize it into words. Normalize the words to upper case +// and add to a given list, if not already in list. +// We might think about limiting the lower size of words we store. +// Note: we convert to QString before tokenization because we rely in +// Qt's isPunct() function. +static void tokenize(QString s, std::vector<QString> &res) +{ + if (s.isEmpty()) + return; + + QLocale loc; + int size = s.size(); + int pos = 0; + while (pos < size) { + // Skip whitespace and punctuation + while (s[pos].isSpace() || s[pos].isPunct()) { + if (++pos >= size) + return; + } + int end = pos; + while (end < size && !s[end].isSpace() && !s[end].isPunct()) + ++end; + QString word = loc.toUpper(s.mid(pos, end - pos)); // Sad: Locale::toUpper can't use QStringRef - we have to copy the substring! + pos = end; + + if (find(res.begin(), res.end(), word) == res.end()) + res.push_back(word); + } +} + +// Get all words of a dive +static std::vector<QString> getWords(const dive *d) +{ + std::vector<QString> res; + tokenize(QString(d->notes), res); + tokenize(QString(d->divemaster), res); + tokenize(QString(d->buddy), res); + tokenize(QString(d->suit), res); + for (int i = 0; i < d->cylinders.nr; ++i) { + const cylinder_t &cyl = d->cylinders.cylinders[i]; + tokenize(QString(cyl.type.description), res); + } + for (int i = 0; i < d->weightsystems.nr; ++i) { + const weightsystem_t &ws = d->weightsystems.weightsystems[i]; + tokenize(QString(ws.description), res); + } + // TODO: We should tokenize all dive-sites and trips first and then + // take the tokens from a cache. + if (d->dive_site) + tokenize(d->dive_site->name, res); + // TODO: We should index trips separately! + if (d->divetrip) + tokenize(d->divetrip->location, res); + return res; +} + +void FullText::reload() +{ + words.clear(); + int i; + dive *d; + for_each_dive(i, d) + registerDive(d); +} + +void FullText::registerDive(struct dive *d) +{ + if (d->full_text) { + unregisterWords(d, d->full_text->words); + } else { + d->full_text = new full_text_cache; + } + d->full_text->words = getWords(d); + registerWords(d, d->full_text->words); +} + +void FullText::unregisterDive(struct dive *d) +{ + if (!d->full_text) + return; + unregisterWords(d, d->full_text->words); + delete d->full_text; + d->full_text = nullptr; +} + +void FullText::unregisterAll() +{ + int i; + dive *d; + for_each_dive(i, d) { + delete d->full_text; + d->full_text = nullptr; + } + words.clear(); +} + +// Register words of a dive. +void FullText::registerWords(struct dive *d, const std::vector<QString> &w) +{ + for (const QString &word: w) { + std::vector<dive *> &entry = words[word]; + if (std::find(entry.begin(), entry.end(), d) == entry.end()) + entry.push_back(d); + } +} + +// Unregister words of a dive. +void FullText::unregisterWords(struct dive *d, const std::vector<QString> &w) +{ + for (const QString &word: w) { + auto it = words.find(word); + if (it == words.end()) { + qWarning("FullText::unregisterWords: didn't find word '%s' in index!?", qPrintable(word)); + continue; + } + std::vector<dive *> &entry = it->second; + entry.erase(std::remove(entry.begin(), entry.end(), d)); + if (entry.empty()) + words.erase(it); + } +} + +// Add dives from second array to first, if not yet there +void combineDives(std::vector<dive *> &to, const std::vector<dive *> &from) +{ + for (dive *d: from) { + if (std::find(to.begin(), to.end(), d) == to.end()) + to.push_back(d); + } +} + +std::vector<dive *> FullText::findDives(const QString &s, StringFilterMode mode) const +{ + switch (mode) { + case StringFilterMode::EXACT: + default: { + // Try to access a single word + auto it = words.find(s); + if (it == words.end()) + return {}; + return it->second; + } + case StringFilterMode::STARTSWITH: { + // Find all words that start with a substring. We use the fact + // that these words must form a contiguous block, since the words are + // ordered lexicographically. + auto it = words.lower_bound(s); + if (it == words.end() || !it->first.startsWith(s)) + return {}; + std::vector<dive *> res = it->second; + ++it; + while (it != words.end() && it->first.startsWith(s)) { + combineDives(res, it->second); + ++it; + } + return res; + } + case StringFilterMode::SUBSTRING: { + // Find all words that contain a substring. Here, we have to check all words! + std::vector<dive *> res; + for (auto it = words.begin(); it != words.end(); ++it) { + if (it->first.contains(s)) + combineDives(res, it->second); + } + return res; + } + } +} + +FullTextResult FullText::find(const FullTextQuery &q, StringFilterMode mode) const +{ + if (q.words.empty()) + return FullTextResult(); + + std::vector<dive *> res = findDives(q.words[0], mode); + for (size_t i = 1; i < q.words.size(); ++i) { + std::vector<dive *> res2 = findDives(q.words[i], mode); + // Remove dives from res that are not in res2 + res.erase(std::remove_if(res.begin(), res.end(), + [&res2] (dive *d) { return std::find(res2.begin(), res2.end(), d) == res2.end(); }), res.end()); + } + + return { res }; +} + +FullTextQuery &FullTextQuery::operator=(const QString &s) +{ + words.clear(); + tokenize(s, words); + return *this; +} + +bool FullTextQuery::doit() const +{ + return !words.empty(); +} + +bool FullTextResult::dive_matches(const struct dive *d) const +{ + return std::find(dives.begin(), dives.end(), d) != dives.end(); +} + +#endif diff --git a/core/fulltext.h b/core/fulltext.h new file mode 100644 index 000000000..c22a118b8 --- /dev/null +++ b/core/fulltext.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +// A class for performing full text searches on dives. +// Search is case-insensitive. Even though QString has many design +// issues such as COW semantics and UTF-16 encoding, it provides +// platform independence and reasonable performance. Therefore, +// this is based in QString instead of std::string. +// +// To make this accessible from C, this does manual memory management: +// Every dive is associated with a cache of words. Thus, when deleting +// a dive, a function freeing that data has to be called. + +#ifndef FULLTEXT_H +#define FULLTEXT_H + +// For now only compile on desktop +#ifndef SUBSURFACE_MOBILE + +// 1) The C-accessible interface + +#ifdef __cplusplus +extern "C" { +#endif + +struct full_text_cache; +struct dive; +void fulltext_register(struct dive *d); // Note: can be called repeatedly +void fulltext_unregister(struct dive *d); // Note: can be called repeatedly +void fulltext_unregister_all(); // Unregisters all dives in the dive table +void fulltext_reload(); // Registers all dives in the dive table + +#ifdef __cplusplus +} +#endif + +// 2) The C++-only interface +#ifdef __cplusplus + +#include <QString> +#include <vector> + +enum class StringFilterMode { + SUBSTRING = 0, + STARTSWITH = 1, + EXACT = 2 +}; + +// A fulltext query. Basically a list of normalized words we search for +struct FullTextQuery { + std::vector<QString> words; + FullTextQuery &operator=(const QString &); // Initialize by assigning a user-provided search string + bool doit() const; // true if we should to a fulltext search +}; + +// Describes the result of a fulltext search +struct FullTextResult { + std::vector<dive *> dives; + bool dive_matches(const struct dive *d) const; +}; + +// Two search modes: +// 1) Find all dives matching the query. +// 2) Test if a given dive matches the query. +FullTextResult fulltext_find_dives(const FullTextQuery &q, StringFilterMode); +bool fulltext_dive_matches(const struct dive *d, const FullTextQuery &q, StringFilterMode); + +#endif +#endif // SUBSURFACE_MOBILE +#endif |