1 : #ifndef EPT_TEXTSEARCH_TEXTSEARCH_H
2 : #define EPT_TEXTSEARCH_TEXTSEARCH_H
3 :
4 : /** @file
5 : * @author Enrico Zini <enrico@enricozini.org>
6 : * Fast full-text search
7 : */
8 :
9 : /*
10 : * Copyright (C) 2007 Enrico Zini <enrico@debian.org>
11 : *
12 : * This program is free software; you can redistribute it and/or modify
13 : * it under the terms of the GNU General Public License as published by
14 : * the Free Software Foundation; either version 2 of the License, or
15 : * (at your option) any later version.
16 : *
17 : * This program is distributed in the hope that it will be useful,
18 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 : * GNU General Public License for more details.
21 : *
22 : * You should have received a copy of the GNU General Public License
23 : * along with this program; if not, write to the Free Software
24 : * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 : */
26 :
27 : #include <xapian.h>
28 : #include <vector>
29 : #include <string>
30 :
31 : namespace ept {
32 : namespace apt {
33 : class Apt;
34 : class PackageRecord;
35 : }
36 : namespace debtags {
37 : class Debtags;
38 : }
39 : namespace textsearch {
40 :
41 : // Value numbers (Xapian::valueno) allocated for the known per-document values
42 : const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1;
43 : const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2;
44 : const Xapian::valueno VAL_POPCON = 10;
45 : const Xapian::valueno VAL_ITERATING_RATING = 20;
46 : const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
47 : const Xapian::valueno VAL_ITERATING_USABILITY = 22;
48 : const Xapian::valueno VAL_ITERATING_SECURITY = 23;
49 : const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24;
50 : const Xapian::valueno VAL_ITERATING_QUALITY = 25;
51 : const Xapian::valueno VAL_ITERATING_SUPPORT = 26;
52 : const Xapian::valueno VAL_ITERATING_ADOPTION = 27;
53 : // If you need to index a value and cannot edit this file, feel free to use any
54 : // value number starting from 1000000
55 :
56 :
57 : /*
58 : Fallback on apt scan searches when index is not present
59 :
60 : Explicitly decide at instantiation (or at any other time) if a rebuild should
61 : be performed. Just adding a 'rebuildIfNeeded' method would be enough.
62 :
63 : 17:14 #xapian < enrico> Hello. I'm finally in a position of writing a library to maintain
64 : a xapian index with Debian package descriptions in a Debian system
65 : 17:14 #xapian < enrico> I have a question, though
66 : 17:14 #xapian < enrico> The descriptions change regularly as people run 'apt-get update'
67 : 17:15 #xapian < enrico> I'd need to have a way to update the description index after
68 : apt-get update, without rebuilding it from scratch
69 : 17:15 #xapian < enrico> Is there some documentation on how to do that? I can't exactly
70 : tell Xapian "the new description for package foo is this" because
71 : I'd need the xapian id
72 : 19:11 #xapian < omega> you can add a unique term with a boolean prefix?
73 : 19:11 #xapian < omega> like Qpackage-name
74 : 19:11 #xapian < omega> then you search for it and replace_document
75 : 19:24 #xapian < richardb> Or indeed, you use the "replace_document()" form which takes a
76 : unique_id term.
77 : 19:25 #xapian < richardb> Xapian::docid replace_document(const std::string &
78 : unique_term,
79 : 19:25 #xapian < richardb> const Xapian::Document &
80 : document);
81 : 19:43 #xapian < enrico> unique term
82 : 19:43 #xapian < enrico> nice!
83 : 19:44 #xapian < enrico> can I use a non-alpha prefix, like :package-name ?
84 : 19:45 #xapian < enrico> or pkg:package-name
85 : 19:45 #xapian < enrico> I suppose I can
86 : */
87 :
88 : /**
89 : * Maintains and accesses a Xapian index of package descriptions.
90 : *
91 : * Unlike Debtags and Popcon, TextSearch does not attempt to create the
92 : * index in the home directory if no system index is found and it is not
93 : * running as root: this is to avoid secretly building large indexes (>50MB)
94 : * in the home directory of users.
95 : *
96 : * The idea then is to have root keep the index up to date, possibly running a
97 : * reindexing tool once a day, or after an apt-get update.
98 : *
99 : * This works because the full text search index is useful even if it is
100 : * slightly out of date.
101 : */
102 : class TextSearch
103 11 : {
104 : protected:
105 : time_t m_timestamp;
106 : Xapian::Database m_db;
107 : Xapian::Stem m_stem;
108 :
109 : /// Return a lowercased copy of the string
110 : static std::string toLower(const std::string& str);
111 :
112 : /**
113 : * Add normalised tokens computed from the string term to the document doc.
114 : *
115 : * pos is used as a sequence generator for entering the token position in
116 : * the document.
117 : */
118 : void normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const;
119 :
120 : public:
121 : struct ExtraIndexer // Functor interface: called with a document and the package record it refers to
122 : {
123 0 : virtual ~ExtraIndexer() {}
124 : virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const = 0;
125 : };
126 :
127 : TextSearch();
128 :
129 : /// Access the Xapian database
130 3 : Xapian::Database& db() { return m_db; }
131 :
132 : /// Access the Xapian database (read-only)
133 4 : const Xapian::Database& db() const { return m_db; }
134 :
135 : /// Timestamp of when the Xapian database was last updated
136 3 : time_t timestamp() const { return m_timestamp; }
137 :
138 : /// Returns true if the index has data, that is, if its timestamp is greater than zero
139 3 : bool hasData() const { return m_timestamp > 0; }
140 :
141 : /// Returns true if the index is older than the Apt database information
142 : bool needsRebuild(apt::Apt& apt);
143 :
144 : /**
145 : * Rebuild the index if needed.
146 : *
147 : * Extra functors that contribute to the indexing can be passed in extraIndexers.
148 : *
149 : * @note This requires write access to the index directory.
150 : * @note This is not the main way to update the index: it is provided here
151 : * only as a way to build a draft index for the library tests
152 : */
153 : bool rebuildIfNeeded(
154 : apt::Apt& apt,
155 : const std::vector<const ExtraIndexer*>& extraIndexers = std::vector<const ExtraIndexer*>());
156 :
157 : /**
158 : * Retrieve a Xapian docid by package name
159 : */
160 : Xapian::docid docidByName(const std::string& pkgname) const;
161 :
162 : /**
163 : * Tokenize the string and build an OR query with the resulting keywords
164 : */
165 : Xapian::Query makeORQuery(const std::string& keywords) const;
166 :
167 : /**
168 : * Tokenize the string and build an OR query with the resulting keywords.
169 : *
170 : * The last token in keywords is considered to be typed only partially, to
171 : * implement proper search-as-you-type.
172 : */
173 : Xapian::Query makePartialORQuery(const std::string& keywords) const;
174 :
175 : /**
176 : * Build an OR query with the given keywords, specified as a [begin, end) range of iterators over strings
177 : */
178 : template<typename ITER>
179 3 : Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const
180 : {
181 3 : std::vector<std::string> terms;
182 : // Insert each lowercased query term and, when its stem differs from it, the stem prefixed with "Z"
183 10 : for (ITER i = begin; i != end; ++i)
184 : {
185 7 : std::string t = toLower(*i);
186 7 : std::string s = m_stem(t);
187 7 : terms.push_back(t);
188 7 : if (s != t)
189 0 : terms.push_back("Z" + s);
190 : }
191 3 : return Xapian::Query(Xapian::Query::OP_OR, terms.begin(), terms.end());
192 : }
193 :
194 : /// Return a list of tag-based terms that can be used to expand an OR query
195 : std::vector<std::string> expand(Xapian::Enquire& enq) const;
196 :
197 : // std::vector<std::string> similar(const std::string& pkg);
198 :
199 : /**
200 : * Create a query to look for packages similar to the given one
201 : */
202 : Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
203 :
204 : /**
205 : * Get the double value identified by val_id for the package pkgname
206 : */
207 : double getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const;
208 :
209 : /**
210 : * Get the integer value identified by val_id for the package pkgname
211 : */
212 : int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const;
213 : };
214 :
215 : }
216 : }
217 :
218 : // vim:set ts=4 sw=4:
219 : #endif
|