aboutsummaryrefslogtreecommitdiffstats
path: root/core/fulltext.cpp
blob: 2e27d8524baf0da74dab083ba5f3ee9daeeca9e7 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
// SPDX-License-Identifier: GPL-2.0

#include "fulltext.h"
#include "dive.h"
#include "divesite.h"
#include "tag.h"
#include "trip.h"
#include "qthelper.h"
#include <QLocale>
#include <map>

// This class caches each dives words, so that we can unregister a dive from the full text search
struct full_text_cache {
	std::vector<QString> words;
};

// The FullText-search class
class FullText {
	std::map<QString, std::vector<dive *>> words; // Dives that belong to each word
public:

	void populate(); // Rebuild from current dive_table
	void registerDive(struct dive *d); // Note: can be called repeatedly
	void unregisterDive(struct dive *d); // Note: can be called repeatedly
	void unregisterAll(); // Unregister all dives in the dive table
	FullTextResult find(const FullTextQuery &q, StringFilterMode mode) const; // Find dives matchin all words.
private:
	void registerWords(struct dive *d, const std::vector<QString> &w);
	void unregisterWords(struct dive *d, const std::vector<QString> &w);
	std::vector<dive *> findDives(const QString &s, StringFilterMode mode) const; // Find dives matching a given word.
};

// This class doesn't depend on any other objects, we might just initialize it at startup.
static FullText self;

// C-interface functions

extern "C" {

void fulltext_register(struct dive *d)
{
	self.registerDive(d);
}

void fulltext_unregister(struct dive *d)
{
	self.unregisterDive(d);
}

void fulltext_unregister_all()
{
	self.unregisterAll();
}

void fulltext_populate()
{
	self.populate();
}

} // extern "C"

// C++-only interface functions
FullTextResult fulltext_find_dives(const FullTextQuery &q, StringFilterMode mode)
{
	return self.find(q, mode);
}

// Check whether a single dive matches the fulltext criterion
bool fulltext_dive_matches(const struct dive *d, const FullTextQuery &q, StringFilterMode mode)
{
	if (!q.doit())
		return true;
	if (!d->full_text)
		return false;
	auto matchFunc =
		mode == StringFilterMode::EXACT ? [](const QString &s1, const QString &s2) { return s1 == s2; } :
		mode == StringFilterMode::STARTSWITH ? [](const QString &s1, const QString &s2) { return s1.startsWith(s2); } :
		/* mode == StringFilterMode::SUBSTRING ? */ [](const QString &s1, const QString &s2) { return s1.contains(s2); };
	const std::vector<QString> &words = d->full_text->words;
	for (const QString &search: q.words) {
		if (std::any_of(words.begin(), words.end(), [&search,matchFunc](const QString &w) { return matchFunc(w, search); }))
			return true;
	}
	return false;
}

// Class implementation

// Take a text and tokenize it into words. Normalize the words to upper case
// and add to a given list, if not already in list.
// We might think about limiting the lower size of words we store.
// Note: we convert to QString before tokenization because we rely in
// Qt's isPunct() function.
static void tokenize(QString s, std::vector<QString> &res)
{
	if (s.isEmpty())
		return;

	QLocale loc;
	int size = s.size();
	int pos = 0;
	while (pos < size) {
		// Skip whitespace and punctuation
		while (s[pos].isSpace() || s[pos].isPunct()) {
			if (++pos >= size)
				return;
		}
		int end = pos;
		while (end < size && !s[end].isSpace() && !s[end].isPunct())
			++end;
		QString word = loc.toUpper(s.mid(pos, end - pos)); // Sad: Locale::toUpper can't use QStringRef - we have to copy the substring!
		pos = end;

		if (find(res.begin(), res.end(), word) == res.end())
			res.push_back(word);
	}
}

// Get all words of a dive
static std::vector<QString> getWords(const dive *d)
{
	std::vector<QString> res;
	tokenize(QString(d->notes), res);
	tokenize(QString(d->divemaster), res);
	tokenize(QString(d->buddy), res);
	tokenize(QString(d->suit), res);
	for (const tag_entry *tag = d->tag_list; tag; tag = tag->next)
		tokenize(QString(tag->tag->name), res);
	for (int i = 0; i < d->cylinders.nr; ++i) {
		const cylinder_t &cyl = d->cylinders.cylinders[i];
		tokenize(QString(cyl.type.description), res);
	}
	for (int i = 0; i < d->weightsystems.nr; ++i) {
		const weightsystem_t &ws = d->weightsystems.weightsystems[i];
		tokenize(QString(ws.description), res);
	}
	// TODO: We should tokenize all dive-sites and trips first and then
	// take the tokens from a cache.
	if (d->dive_site)
		tokenize(d->dive_site->name, res);
	// TODO: We should index trips separately!
	if (d->divetrip)
		tokenize(d->divetrip->location, res);
	return res;
}

void FullText::populate()
{
	// we want this to be two calls as the second text is overwritten below by the lines starting with "\r"
	uiNotification(QObject::tr("Create full text index"));
	uiNotification(QObject::tr("start processing"));
	int i;
	dive *d;
	for_each_dive(i, d)
		registerDive(d);
	uiNotification(QObject::tr("%1 dives processed").arg(dive_table.nr));
}

void FullText::registerDive(struct dive *d)
{
	if (d->full_text) {
		unregisterWords(d, d->full_text->words);
	} else {
		d->full_text = new full_text_cache;
	}
	d->full_text->words = getWords(d);
	registerWords(d, d->full_text->words);
}

void FullText::unregisterDive(struct dive *d)
{
	if (!d->full_text)
		return;
	unregisterWords(d, d->full_text->words);
	delete d->full_text;
	d->full_text = nullptr;
}

void FullText::unregisterAll()
{
	int i;
	dive *d;
	for_each_dive(i, d) {
		delete d->full_text;
		d->full_text = nullptr;
	}
	words.clear();
}

// Register words of a dive.
void FullText::registerWords(struct dive *d, const std::vector<QString> &w)
{
	for (const QString &word: w) {
		std::vector<dive *> &entry = words[word];
		if (std::find(entry.begin(), entry.end(), d) == entry.end())
			entry.push_back(d);
	}
}

// Unregister words of a dive.
void FullText::unregisterWords(struct dive *d, const std::vector<QString> &w)
{
	for (const QString &word: w) {
		auto it = words.find(word);
		if (it == words.end()) {
			qWarning("FullText::unregisterWords: didn't find word '%s' in index!?", qPrintable(word));
			continue;
		}
		std::vector<dive *> &entry = it->second;
		entry.erase(std::remove(entry.begin(), entry.end(), d));
		if (entry.empty())
			words.erase(it);
	}
}

// Add dives from second array to first, if not yet there
void combineDives(std::vector<dive *> &to, const std::vector<dive *> &from)
{
	for (dive *d: from) {
		if (std::find(to.begin(), to.end(), d) == to.end())
			to.push_back(d);
	}
}

std::vector<dive *> FullText::findDives(const QString &s, StringFilterMode mode) const
{
	switch (mode) {
	case StringFilterMode::EXACT:
	default: {
		// Try to access a single word
		auto it = words.find(s);
		if (it == words.end())
			return {};
		return it->second;
	}
	case StringFilterMode::STARTSWITH: {
		// Find all words that start with a substring. We use the fact
		// that these words must form a contiguous block, since the words are
		// ordered lexicographically.
		auto it = words.lower_bound(s);
		if (it == words.end() || !it->first.startsWith(s))
			return {};
		std::vector<dive *> res = it->second;
		++it;
		while (it != words.end() && it->first.startsWith(s)) {
			combineDives(res, it->second);
			++it;
		}
		return res;
	}
	case StringFilterMode::SUBSTRING: {
		// Find all words that contain a substring. Here, we have to check all words!
		std::vector<dive *> res;
		for (auto it = words.begin(); it != words.end(); ++it) {
			if (it->first.contains(s))
				combineDives(res, it->second);
		}
		return res;
	}
	}
}

FullTextResult FullText::find(const FullTextQuery &q, StringFilterMode mode) const
{
	if (q.words.empty())
		return FullTextResult();

	std::vector<dive *> res = findDives(q.words[0], mode);
	for (size_t i = 1; i < q.words.size(); ++i) {
		std::vector<dive *> res2 = findDives(q.words[i], mode);
		// Remove dives from res that are not in res2
		res.erase(std::remove_if(res.begin(), res.end(),
				[&res2] (dive *d) { return std::find(res2.begin(), res2.end(), d) == res2.end(); }), res.end());
	}

	return { res };
}

FullTextQuery &FullTextQuery::operator=(const QString &s)
{
	words.clear();
	tokenize(s, words);
	return *this;
}

bool FullTextQuery::doit() const
{
	return !words.empty();
}

bool FullTextResult::dive_matches(const struct dive *d) const
{
	return std::find(dives.begin(), dives.end(), d) != dives.end();
}