| 1 | #!/usr/bin/python |
|---|
| 2 | # -*- coding: UTF-8 -*- |
|---|
| 3 | # |
|---|
| 4 | # Copyright (c) 2007 Tarek Ziadé |
|---|
| 5 | # |
|---|
| 6 | # Authors: |
|---|
| 7 | # Tarek Ziadé <tarek@ziade.org> |
|---|
| 8 | # |
|---|
| 9 | # This program is free software; you can redistribute it and/or |
|---|
| 10 | # modify it under the terms of the GNU General Public License |
|---|
| 11 | # as published by the Free Software Foundation; either version 2 |
|---|
| 12 | # of the License, or (at your option) any later version. |
|---|
| 13 | # |
|---|
| 14 | # This program is distributed in the hope that it will be useful, |
|---|
| 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 17 | # GNU General Public License for more details. |
|---|
| 18 | # |
|---|
| 19 | # You should have received a copy of the GNU General Public License |
|---|
| 20 | # along with this program; if not, write to the Free Software |
|---|
| 21 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
|---|
| 22 | # $Id: tables.py 1518 2007-05-21 12:35:44Z rage $ |
|---|
"""Filter entries with a Bayesian classifier.

Two modes:
 - interactive: training
 - direct
"""
|---|
| 29 | import os |
|---|
| 30 | import sys |
|---|
| 31 | |
|---|
| 32 | from filtering import register_filter |
|---|
| 33 | |
|---|
| 34 | from BayesCore.classifier import BayesClassifier |
|---|
| 35 | from BayesCore.storage import SQLStorage |
|---|
| 36 | from BayesCore.tokenizer import AllFilters |
|---|
| 37 | |
|---|
# XXX todo: conf file for language
LANG = 'fr'

# Directory part of this module's path; '.' when that part is empty.
current_dir = os.path.dirname(__file__) or '.'

# NOTE(review): `curdir` is not a standard attribute of `sys` (the stdlib
# constant is `os.curdir`); it is set here as in the original module --
# confirm whether anything still reads it.
sys.curdir = current_dir

# Default database URI used when the functions below get no `sqluri`.
SQLURI = 'sqlite:///filters/BayesCore/data/bayes.db'
|---|
| 50 | |
|---|
def bayesian(entry, entries, sqluri=None):
    """Tell whether `entry` is kept, according to the Bayesian classifier.

    The entry's content and optional title are UTF-8 encoded, joined with
    a space and submitted to a `BayesClassifier` built on an `SQLStorage`
    opened on `sqluri`.

    :param entry: mapping with a 'content' key and an optional 'title'
        key; both values must provide an ``encode`` method.
    :param entries: not used by this function (presumably required by the
        signature `register_filter` expects -- to be confirmed).
    :param sqluri: URI handed to `SQLStorage`; the module-level `SQLURI`
        is used when it is None.
    :return: True when the classifier returns an empty guess or when the
        label of its first guess is 'nojunk', False otherwise.
    """
    content = entry['content'].encode('utf8')

    # Membership is tested on the mapping itself; no intermediate key list.
    if 'title' in entry:
        title = entry['title'].encode('utf8')
    else:
        title = ''

    data = '%s %s' % (content, title)

    if sqluri is None:
        sqluri = SQLURI

    classifier = BayesClassifier(LANG, SQLStorage(sqluri), AllFilters())

    # let's test the entry
    result = classifier.guess(data)

    # No guess at all: keep the entry. Any empty result is accepted here
    # (not only the literal `[]`), so indexing below cannot fail on it.
    if not result:
        return True

    # result[0][0] is the label of the first guess (presumably the best
    # ranked one -- confirm against BayesClassifier.guess).
    return result[0][0] == 'nojunk'
|---|
| 75 | |
|---|
# NOTE(review): hands `bayesian` to `filtering.register_filter`; presumably
# this is what makes it selectable as a filter -- confirm in `filtering`.
register_filter(bayesian)
|---|
| 77 | |
|---|
def bayesian_learn(entry, sqluri=None, answer=None):
    """Train the Bayesian classifier with `entry`.

    The entry's content and optional title are UTF-8 encoded, joined with
    a space and learned as 'nojunk' when the answer is 'y' or 'yes'
    (case and surrounding whitespace are ignored), and as 'junk' for any
    other answer.

    When `answer` is None the function is interactive: it prints the
    title and asks on standard input whether the entry is interesting.
    Replying 'm' prints the whole content and asks once more.

    :param entry: mapping with a 'content' key and an optional 'title'
        key; both values must provide an ``encode`` method.
    :param sqluri: URI handed to `SQLStorage`; the module-level `SQLURI`
        is used when it is None.
    :param answer: string answer to use instead of prompting; None
        triggers the interactive prompt.
    """
    content = entry['content'].encode('utf8')

    # Membership is tested on the mapping itself; no intermediate key list.
    if 'title' in entry:
        title = entry['title'].encode('utf8')
    else:
        title = ''

    if sqluri is None:
        sqluri = SQLURI

    data = '%s %s' % (content, title)

    classifier = BayesClassifier(LANG, SQLStorage(sqluri), AllFilters())

    if answer is None:
        # Interactive mode. The single-argument, parenthesised print form
        # gives the same output as the Python 2 statement and also parses
        # under Python 3.
        print('title : %s' % title)

        answer = raw_input("Interesting (type 'm' for more) ? (y/n) ")

        if answer.strip().lower() == 'm':
            print(content)
            answer = raw_input("Interesting ? (y/n) ")

    print('Learning...')

    # The answer is normalised once, here, whatever its origin.
    if answer.strip().lower() in ('y', 'yes'):
        category = 'nojunk'
    else:
        category = 'junk'

    classifier.learn(data, category)
|---|
| 111 | |
|---|