Merge lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length into lp:zeitgeist-extensions

Proposed by Mikkel Kamstrup Erlandsen
Status: Merged
Merged at revision: 73
Proposed branch: lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length
Merge into: lp:zeitgeist-extensions
Diff against target: 114 lines (+41/-13)
2 files modified
fts/_tests.py (+1/-0)
fts/fts.py (+40/-13)
To merge this branch: bzr merge lp:~kamstrup/zeitgeist-extensions/fts-cap-term-length
Reviewer Review Type Date Requested Status
Zeitgeist Extensions Pending
Review via email: mp+74362@code.launchpad.net

Description of the change

See attached bug

To post a comment you must log in.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'fts/_tests.py'
2--- fts/_tests.py 2011-09-01 13:46:30 +0000
3+++ fts/_tests.py 2011-09-07 08:45:23 +0000
4@@ -104,3 +104,4 @@
5 assert u"漢字" in results[0].subjects[0].text, results[0].subjects[0].uri
6
7
8+
9
10=== modified file 'fts/fts.py'
11--- fts/fts.py 2011-09-06 10:03:23 +0000
12+++ fts/fts.py 2011-09-07 08:45:23 +0000
13@@ -50,6 +50,7 @@
14 import threading
15 from urllib import quote as url_escape, unquote as url_unescape
16 import gobject, gio
17+from cStringIO import StringIO
18
19 from zeitgeist.datamodel import Symbol, StorageState, ResultType, TimeRange, NULL_EVENT, NEGATION_OPERATOR
20 from _zeitgeist.engine.datamodel import Event, Subject
21@@ -93,6 +94,10 @@
22 ResultType.LeastPopularActor,
23 ]
24
25+# Xapian has a maximum term length of 245 bytes and Bad Things(TM) happen
26+# if you bust that. We use the cap_string() function to control this.
27+MAX_TERM_LENGTH = 245
28+
29 def synchronized(lock):
30 """ Synchronization decorator. """
31
32@@ -197,6 +202,31 @@
33 result += c
34 return result
35
36+def cap_string (s, nbytes=MAX_TERM_LENGTH):
37+ """
38+ If s has more than nbytes bytes (not characters) then cap it off
39+ after nbytes bytes in a way still producing a valid utf-8 string.
40+
41+ Assumes that s is a utf-8 string.
42+
43+ This function is useful for working with Xapian terms because Xapian has
44+ a max term length of 245 (which is not very well documented, but see
45+ http://xapian.org/docs/omega/termprefixes.html).
46+ """
47+ # Check if we can fast-path this string
48+ if (len(s.encode("utf-8")) <= nbytes):
49+ return s
50+
51+ # We use a StringIO here to avoid memory thrashing via naive
52+ # string concatenation. See e.g. http://www.skymind.com/~ocrow/python_string/
53+ buf = StringIO()
54+ for char in s :
55+ if buf.tell() >= nbytes - 1 :
56+ return buf.getvalue()
57+ buf.write(char.encode("utf-8"))
58+
59+ return unicode(buf.getvalue().decode("utf-8"))
60+
61 def expand_type (type_prefix, uri):
62 """
63 Return a string with a Xapian query matching all child types of 'uri'
64@@ -564,7 +594,7 @@
65
66 doc = self._tokenizer.get_document()
67 for cat in desktop.getCategories():
68- doc.add_boolean_term(FILTER_PREFIX_XDG_CATEGORY+cat.lower())
69+ doc.add_boolean_term(cap_string(FILTER_PREFIX_XDG_CATEGORY+cat.lower()))
70 else:
71 log.debug("Unable to look up app info for %s" % actor)
72
73@@ -649,25 +679,25 @@
74 """Adds the filtering rules to the doc. Filtering rules will
75 not affect the relevancy ranking of the event/doc"""
76 if event.interpretation:
77- doc.add_boolean_term (FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation)
78+ doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_INTERPRETATION+event.interpretation))
79 if event.manifestation:
80- doc.add_boolean_term (FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation)
81+ doc.add_boolean_term (cap_string(FILTER_PREFIX_EVENT_MANIFESTATION+event.manifestation))
82 if event.actor:
83- doc.add_boolean_term (FILTER_PREFIX_ACTOR+mangle_uri(event.actor))
84+ doc.add_boolean_term (cap_string(FILTER_PREFIX_ACTOR+mangle_uri(event.actor)))
85
86 for su in event.subjects:
87 if su.uri:
88- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri))
89+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_URI+mangle_uri(su.uri)))
90 if su.interpretation:
91- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation)
92+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_INTERPRETATION+su.interpretation))
93 if su.manifestation:
94- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation)
95+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MANIFESTATION+su.manifestation))
96 if su.origin:
97- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin))
98+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_ORIGIN+mangle_uri(su.origin)))
99 if su.mimetype:
100- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype)
101+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_MIMETYPE+su.mimetype))
102 if su.storage:
103- doc.add_boolean_term (FILTER_PREFIX_SUBJECT_STORAGE+su.storage)
104+ doc.add_boolean_term (cap_string(FILTER_PREFIX_SUBJECT_STORAGE+su.storage))
105
106 @synchronized (INDEX_LOCK)
107 def _index_event_real (self, event):
108@@ -766,6 +796,3 @@
109
110 return "%s..%sms" % (time_range.begin, time_range.end)
111
112-if __name__ == "__main__":
113- indexer = Indexer(None)
114- print indexer._compile_filter_query([Event.new_for_values(subject_interpretation="http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#Document")])

Subscribers

People subscribed via source and target branches