| 1 | #!/usr/bin/python |
|---|
| 2 | # -*- coding: UTF-8 -*- |
|---|
| 3 | # |
|---|
| 4 | # Copyright (c) 2007 Tarek Ziadé |
|---|
| 5 | # |
|---|
| 6 | # Authors: |
|---|
| 7 | # Tarek Ziadé <tarek@ziade.org> |
|---|
| 8 | # |
|---|
| 9 | # This program is free software; you can redistribute it and/or |
|---|
| 10 | # modify it under the terms of the GNU General Public License |
|---|
| 11 | # as published by the Free Software Foundation; either version 2 |
|---|
| 12 | # of the License, or (at your option) any later version. |
|---|
| 13 | # |
|---|
| 14 | # This program is distributed in the hope that it will be useful, |
|---|
| 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 17 | # GNU General Public License for more details. |
|---|
| 18 | # |
|---|
| 19 | # You should have received a copy of the GNU General Public License |
|---|
| 20 | # along with this program; if not, write to the Free Software |
|---|
| 21 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
|---|
| 22 | """ Searcher |
|---|
| 23 | """ |
|---|
| 24 | |
|---|
| 25 | import os |
|---|
| 26 | import logging |
|---|
| 27 | |
|---|
| 28 | import xapian |
|---|
| 29 | from tokenizer import tokenize |
|---|
| 30 | from settings import DB_FILE |
|---|
| 31 | from model import statistics |
|---|
| 32 | |
|---|
def read_only():
    """Open the index stored at ``DB_FILE`` and return the database handle.

    The handle is obtained from ``xapian.flint_open`` called with the path
    only; every call opens a new handle.
    """
    database = xapian.flint_open(DB_FILE)
    return database
|---|
| 35 | |
|---|
def corpus_size():
    """Return the number of documents held by the index."""
    return read_only().get_doccount()
|---|
| 40 | |
|---|
def _get_document_internal_id(uid):
    """Return the Xapian docid of the document indexed under *uid*.

    The document is looked up through its ``Q<uid>`` term. ``None`` is
    returned when no document matches.
    """
    lookup = xapian.Enquire(read_only())
    lookup.set_query(xapian.Query('Q%s' % uid))
    # At most one match is requested, so the loop body runs zero or one time.
    for match in lookup.get_mset(0, 1):
        return match.docid
    return None
|---|
| 50 | |
|---|
def document_exists(uid):
    """Tell whether a document is indexed under *uid*."""
    docid = _get_document_internal_id(uid)
    return docid is not None
|---|
| 54 | |
|---|
def document_terms(uid):
    """Return the terms of the document indexed under *uid*.

    The result is a generator over the document's terms, leaving out the
    ``Q<uid>`` term that identifies the document itself. ``None`` is
    returned when no document is indexed under *uid*.
    """
    docid = _get_document_internal_id(uid)
    if docid is None:
        return None

    # The identifying term is the same for every element; build it once.
    uid_term = 'Q%s' % uid
    # A database handle is only opened once the document is known to exist.
    terms = read_only().get_document(docid).termlist()
    return (el.term for el in terms if el.term != uid_term)
|---|
| 63 | |
|---|
def search(query, or_=False, language=None):
    """Search the index for *query* and record the query in the statistics.

    query -- raw text to look for; it is split into terms by ``tokenize``.
    or_ -- when true the terms are combined with ``OP_OR``, otherwise
           with ``OP_AND``.
    language -- when not ``None``, handed to the tokenizer as the ``lang``
                option.

    Returns a generator yielding one value per match (at most 100 matches
    are requested): the uid read from the document's first ``Q``-prefixed
    term, or ``None`` when the document has no such term.
    """
    def _extract_uid(result):
        # The uid is stored as a 'Q'-prefixed term. Stop at the first one
        # instead of collecting every matching term of the document.
        for item in result.document.termlist():
            if item.term.startswith('Q'):
                return item.term[1:]
        return None

    # Let logging do the formatting, so it only happens if the record is emitted.
    logging.debug('searching for "%s"', query)

    db = read_only()
    # NOTE(review): 'treshold' (sic) is the key passed to tokenize, which is
    # defined outside this file; the spelling is kept as is on purpose.
    options = {'treshold': 2}
    if language is not None:
        options['lang'] = language

    tquery = tokenize(query, options=options)
    enquire = xapian.Enquire(db)
    if or_:
        op = xapian.Query.OP_OR
    else:
        op = xapian.Query.OP_AND

    enquire.set_query(xapian.Query(op, tquery))
    res = enquire.get_mset(0, 100)

    logging.debug('searching for "%s" is over', query)

    # Keep a per-query counter: bump it when the query was already seen,
    # create it otherwise.
    # NOTE(review): the count is read and then written back in a separate
    # statement, so concurrent searches for the same query may lose an
    # increment -- confirm whether that matters for the statistics.
    stat = statistics.select(statistics.c.query == query).execute().fetchone()

    if stat is not None:
        count = stat.count
        stat.close()
        statistics.update(statistics.c.query == query).execute(count=count + 1)
    else:
        statistics.insert().execute(query=query, count=1)

    return (_extract_uid(el) for el in res)
|---|
| 104 | |
|---|