1 :
2 : /** @file
3 : * @author Enrico Zini <enrico@enricozini.org>
4 : * Fast full-text search
5 : */
6 :
7 : /*
8 : * Copyright (C) 2007 Enrico Zini <enrico@debian.org>
9 : *
10 : * This program is free software; you can redistribute it and/or modify
11 : * it under the terms of the GNU General Public License as published by
12 : * the Free Software Foundation; either version 2 of the License, or
13 : * (at your option) any later version.
14 : *
15 : * This program is distributed in the hope that it will be useful,
16 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 : * GNU General Public License for more details.
19 : *
20 : * You should have received a copy of the GNU General Public License
21 : * along with this program; if not, write to the Free Software
22 : * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 : */
24 :
25 : #include <ept/textsearch/textsearch.h>
26 : #include <ept/textsearch/maint/path.h>
27 : #include <ept/apt/apt.h>
28 : #include <ept/apt/packagerecord.h>
29 : //#include <ept/debtags/debtags.h>
30 :
31 : #include <wibble/regexp.h>
32 : #include <cctype>
33 : #include <cmath>
34 :
35 : #include <xapian/queryparser.h>
36 :
37 : #include <algorithm>
38 :
39 : #include <iostream>
40 :
41 : using namespace std;
42 : using namespace ept::apt;
43 : using namespace ept::debtags;
44 :
45 : namespace ept {
46 : namespace textsearch {
47 :
// Optional cap used by rebuildIfNeeded() to stop indexing early (meant for
// tests, where a full rebuild would take too long). 0 disables the cap.
48 : size_t max_index = 0;
49 :
50 11 : TextSearch::TextSearch()
51 11 : : m_timestamp(0), m_stem("en")
52 : {
53 22 : m_timestamp = Path::indexTimestamp();
54 11 : if (m_timestamp)
55 9 : m_db.add_database(Xapian::Database(Path::index()));
56 11 : }
57 :
58 7 : std::string TextSearch::toLower(const std::string& str)
59 : {
60 7 : std::string res;
61 7 : res.reserve(str.size());
62 44 : for (std::string::const_iterator i = str.begin(); i != str.end(); ++i)
63 37 : res += tolower(*i);
64 0 : return res;
65 : }
66 :
67 3 : bool TextSearch::needsRebuild(apt::Apt& apt)
68 : {
69 3 : return apt.timestamp() > m_timestamp;
70 : }
71 :
72 0 : void TextSearch::normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const
73 : {
74 0 : string t = TextSearch::toLower(term);
75 0 : string s = m_stem(t);
76 0 : doc.add_term(t);
77 0 : if (s != t)
78 0 : doc.add_term(s);
79 0 : }
80 :
81 10 : bool TextSearch::rebuildIfNeeded(apt::Apt& apt, const std::vector<const TextSearch::ExtraIndexer*>& extraIndexers)
82 : {
83 : // Check if a rebuild is needed, and keep a copy of the APT timestamp for
84 : // saving later
85 10 : time_t aptts = apt.timestamp();
86 10 : if (aptts <= m_timestamp)
87 4 : return false;
88 :
89 : // Reindex
90 6 : Xapian::WritableDatabase database(Xapian::Flint::open(Path::index(), Xapian::DB_CREATE_OR_OPEN));
91 6 : Xapian::TermGenerator termgen;
92 6 : termgen.set_stemmer(m_stem);
93 : //database.begin_transaction();
94 6 : PackageRecord rec;
95 6 : size_t count = 0;
96 2610 : for (Apt::record_iterator i = apt.recordBegin();
97 : i != apt.recordEnd(); ++i, ++count)
98 : {
99 : // If we are testing, we can set a limit to how many packages we index,
100 : // to avoid it taking too much time
101 2604 : if (max_index != 0 && count > max_index)
102 0 : break;
103 :
104 2604 : rec.scan(*i);
105 :
106 2604 : Xapian::Document doc;
107 2604 : doc.set_data(rec.package());
108 :
109 2604 : string pkgid = "XP" + rec.package();
110 : //std::cerr << "Add " << pkgid << ": " << idx << std::endl;
111 2604 : doc.add_term(pkgid);
112 :
113 : // Index tags as well
114 2604 : set<string> tags = rec.tag();
115 2604 : for (set<string>::const_iterator ti = tags.begin();
116 : ti != tags.end(); ++ti)
117 0 : doc.add_term("XT"+*ti);
118 :
119 2604 : termgen.set_document(doc);
120 2604 : termgen.index_text_without_positions(rec.package());
121 5208 : termgen.index_text_without_positions(rec.description());
122 :
123 : // Add the values
124 5208 : doc.add_value(VAL_APT_INSTALLED_SIZE, Xapian::sortable_serialise(rec.installedSize()));
125 2604 : doc.add_value(VAL_APT_PACKAGE_SIZE, Xapian::sortable_serialise(rec.packageSize()));
126 :
127 2604 : if (m_timestamp)
128 2170 : database.replace_document(pkgid, doc);
129 : else
130 434 : database.add_document(doc);
131 6 : }
132 :
133 : //database.commit_transaction();
134 :
135 6 : if (!m_timestamp)
136 1 : m_db.add_database(Xapian::Database(Path::index()));
137 : else
138 5 : m_db.reopen();
139 :
140 6 : m_timestamp = aptts;
141 :
142 6 : Path::setTimestamp(aptts);
143 :
144 6 : return true;
145 : }
146 :
147 2 : Xapian::Query TextSearch::makeORQuery(const std::string& keywords) const
148 : {
149 2 : wibble::Tokenizer tok(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED);
150 4 : return makeORQuery(tok.begin(), tok.end());
151 : }
152 :
153 1 : Xapian::Query TextSearch::makePartialORQuery(const std::string& keywords) const
154 : {
155 1 : wibble::Tokenizer tok(keywords, "[A-Za-z0-9_-]+", REG_EXTENDED);
156 1 : vector<string> tokens;
157 : // FIXME: make the Tokenizer iterators properly iterable
158 2 : for (wibble::Tokenizer::const_iterator i = tok.begin();
159 : i != tok.end(); ++i)
160 1 : tokens.push_back(*i);
161 : // Add all the terms starting with 'last'
162 1 : if (!tokens.empty())
163 : {
164 1 : string& last = *tokens.rbegin();
165 1 : if (last.size() == 1)
166 : // Ignore one-letter partial terms: they make the query uselessly
167 : // large and slow, and it's worth just to wait for more characters
168 : // to come
169 0 : tokens.resize(tokens.size() - 1);
170 : else
171 1 : copy(m_db.allterms_begin(last), m_db.allterms_end(last), back_inserter(tokens));
172 : /*
173 : for (Xapian::TermIterator t = m_db.allterms_begin(last);
174 : t != m_db.allterms_end(last); ++t)
175 : tokens.push_back(*t);
176 : */
177 : }
178 1 : return makeORQuery(tokens.begin(), tokens.end());
179 : }
180 :
181 10 : Xapian::docid TextSearch::docidByName(const std::string& pkgname) const
182 : {
183 10 : Xapian::PostingIterator i = m_db.postlist_begin("XP"+pkgname);
184 10 : if (i == m_db.postlist_end("XP"+pkgname))
185 5 : return 0;
186 : else
187 5 : return *i;
188 : }
189 :
190 : struct TagFilter : public Xapian::ExpandDecider
191 4 : {
192 0 : virtual bool operator()(const std::string &term) const { return term[0] == 'T'; }
193 : };
194 :
195 2 : static TagFilter tagFilter;
196 :
197 0 : vector<string> TextSearch::expand(Xapian::Enquire& enq) const
198 : {
199 0 : Xapian::RSet rset;
200 : // Get the top 5 results as 'good ones' to compute the search expansion
201 0 : Xapian::MSet mset = enq.get_mset(0, 5);
202 0 : for (Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i)
203 0 : rset.add_document(i);
204 : // Get the expanded set, only expanding the query with tag names
205 0 : Xapian::ESet eset = enq.get_eset(5, rset, &tagFilter);
206 0 : vector<string> res;
207 0 : for (Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i)
208 0 : res.push_back(*i);
209 0 : return res;
210 : }
211 :
212 0 : Xapian::Query TextSearch::makeRelatedQuery(const std::string& pkgname) const
213 : {
214 0 : Xapian::Enquire enquire(db());
215 :
216 : // Retrieve the document for the given package
217 0 : enquire.set_query(Xapian::Query("XP"+pkgname));
218 0 : Xapian::MSet matches = enquire.get_mset(0, 1);
219 0 : Xapian::MSetIterator mi = matches.begin();
220 0 : if (mi == matches.end()) return Xapian::Query();
221 0 : Xapian::Document doc = mi.get_document();
222 :
223 : // Return the query to get the list of similar documents
224 0 : return Xapian::Query(Xapian::Query::OP_OR, doc.termlist_begin(), doc.termlist_end());
225 : }
226 :
227 4 : double TextSearch::getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const
228 : {
229 4 : Xapian::docid id = docidByName(pkgname);
230 4 : if (id == 0)
231 2 : return 0.0;
232 2 : Xapian::Document doc = db().get_document(id);
233 2 : string val = doc.get_value(val_id);
234 2 : if (val.empty())
235 0 : return 0.0;
236 : else
237 2 : return Xapian::sortable_unserialise(val);
238 : }
239 :
240 4 : int TextSearch::getIntValue(const std::string& pkgname, Xapian::valueno val_id) const
241 : {
242 4 : Xapian::docid id = docidByName(pkgname);
243 4 : if (id == 0)
244 2 : return 0;
245 2 : Xapian::Document doc = db().get_document(id);
246 2 : string val = doc.get_value(val_id);
247 2 : if (val.empty())
248 0 : return 0;
249 : else
250 2 : return (int)nearbyint(Xapian::sortable_unserialise(val));
251 : }
252 :
253 : }
254 6 : }
255 :
256 : // vim:set ts=4 sw=4:
|