| 1 | # -*- coding: utf8 -*- |
|---|
| 2 | # Copyright (c) 2006 Nuxeo SAS <http://nuxeo.com> |
|---|
| 3 | # Authors : Tarek Ziadé <tziade@nuxeo.com> |
|---|
| 4 | # This program is free software; you can redistribute it and/or modify |
|---|
| 5 | # it under the terms of the GNU General Public License version 2 as published |
|---|
| 6 | # by the Free Software Foundation. |
|---|
| 7 | # |
|---|
| 8 | # This program is distributed in the hope that it will be useful, |
|---|
| 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 11 | # GNU General Public License for more details. |
|---|
| 12 | # |
|---|
| 13 | # You should have received a copy of the GNU General Public License |
|---|
| 14 | # along with this program; if not, write to the Free Software |
|---|
| 15 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
|---|
| 16 | # 02111-1307, USA. |
|---|
| 17 | # |
|---|
| 18 | # $Id: classifier.py 47012 2006-07-10 08:57:07Z tziade $ |
|---|
| 19 | """Robinson-fisher method was taken from PopF |
|---|
| 20 | """ |
|---|
| 21 | import logging |
|---|
| 22 | import math |
|---|
| 23 | import operator |
|---|
| 24 | |
|---|
| 25 | logger = logging.getLogger('BayesCore.classifier') |
|---|
| 26 | |
|---|
| 27 | class BayesClassifier(object): |
|---|
| 28 | |
|---|
| 29 | def __init__(self, language, backend, tokenizer, **options): |
|---|
| 30 | self.language = language |
|---|
| 31 | self.backend = backend |
|---|
| 32 | self.tokenizer = tokenizer |
|---|
| 33 | self._learnt = True |
|---|
| 34 | self._probs = None |
|---|
| 35 | |
|---|
| 36 | if options is None: |
|---|
| 37 | self.options = {'lang': self.language} |
|---|
| 38 | else: |
|---|
| 39 | self.options = options |
|---|
| 40 | self.options['lang'] = self.language |
|---|
| 41 | |
|---|
| 42 | def learn(self, data, category): |
|---|
| 43 | """ learn data for a given category |
|---|
| 44 | |
|---|
| 45 | wich means: store words in categories |
|---|
| 46 | """ |
|---|
| 47 | self._learnt = True |
|---|
| 48 | self.backend.add_category(name=category) |
|---|
| 49 | data = self.tokenizer.transform(data, self.options) |
|---|
| 50 | for element in data: |
|---|
| 51 | self.backend.add_word(element, self.language, category) |
|---|
| 52 | |
|---|
| 53 | def unlearn(self, data, category): |
|---|
| 54 | """ ulearn data for a given category |
|---|
| 55 | |
|---|
| 56 | wich means: remove words from categories |
|---|
| 57 | """ |
|---|
| 58 | self._learnt = True |
|---|
| 59 | data = self.tokenizer.transform(data, self.options) |
|---|
| 60 | for element in data: |
|---|
| 61 | self.backend.del_word(element, category) |
|---|
| 62 | |
|---|
| 63 | def guess(self, data): |
|---|
| 64 | """ guess a category for a given data """ |
|---|
| 65 | options = {'lang': self.language} |
|---|
| 66 | data = self.tokenizer.transform(data, self.options) |
|---|
| 67 | |
|---|
| 68 | # XXX this will be cached |
|---|
| 69 | if self._learnt: |
|---|
| 70 | self._probs = self._buildWordProbabilities() |
|---|
| 71 | else: |
|---|
| 72 | if self._probs is None: |
|---|
| 73 | self._probs = self._buildWordProbabilities() |
|---|
| 74 | |
|---|
| 75 | probabilities = self._probs |
|---|
| 76 | self._learnt = False |
|---|
| 77 | |
|---|
| 78 | res = {} |
|---|
| 79 | for category_name in probabilities: |
|---|
| 80 | category_probs = probabilities[category_name] |
|---|
| 81 | p = self.getProbs(category_probs, data) |
|---|
| 82 | if len(p) != 0: |
|---|
| 83 | res[category_name] = self._robinsonFisher(p, category_name) |
|---|
| 84 | res = res.items() |
|---|
| 85 | res.sort(lambda x,y: cmp(y[1], x[1])) |
|---|
| 86 | return res |
|---|
| 87 | |
|---|
| 88 | def getProbs(self, category_probs, words): |
|---|
| 89 | """ extracts the probabilities of tokens in a message |
|---|
| 90 | """ |
|---|
| 91 | probs = [(word, category_probs[word]) |
|---|
| 92 | for word in words if word in category_probs] |
|---|
| 93 | probs.sort(lambda x,y: cmp(y[1],x[1])) |
|---|
| 94 | return probs[:2048] |
|---|
| 95 | |
|---|
| 96 | def _robinsonFisher(self, probs, ignore): |
|---|
| 97 | """ computes the probability of a message being spam (Robinson-Fisher method) |
|---|
| 98 | H = C-1( -2.ln(prod(p)), 2*n ) |
|---|
| 99 | S = C-1( -2.ln(prod(1-p)), 2*n ) |
|---|
| 100 | I = (1 + H - S) / 2 |
|---|
| 101 | Courtesy of http://christophe.delord.free.fr/en/index.html |
|---|
| 102 | """ |
|---|
| 103 | def _chi2P(chi, df): |
|---|
| 104 | """ return P(chisq >= chi, with df degree of freedom) |
|---|
| 105 | |
|---|
| 106 | df must be even |
|---|
| 107 | """ |
|---|
| 108 | assert df & 1 == 0 |
|---|
| 109 | m = chi / 2.0 |
|---|
| 110 | sum = term = math.exp(-m) |
|---|
| 111 | for i in range(1, df/2): |
|---|
| 112 | term *= m/i |
|---|
| 113 | sum += term |
|---|
| 114 | return min(sum, 1.0) |
|---|
| 115 | |
|---|
| 116 | n = len(probs) |
|---|
| 117 | try: |
|---|
| 118 | mlog = math.log(reduce(operator.mul, map(lambda p: p[1], probs), 1.0)) |
|---|
| 119 | H = _chi2P(-2.0 * mlog, 2*n) |
|---|
| 120 | except OverflowError: |
|---|
| 121 | H = 0.0 |
|---|
| 122 | try: |
|---|
| 123 | mlog = math.log(reduce(operator.mul, map(lambda p: 1.0-p[1], probs), 1.0)) |
|---|
| 124 | S = _chi2P(-2.0 * mlog, 2*n) |
|---|
| 125 | except OverflowError: |
|---|
| 126 | S = 0.0 |
|---|
| 127 | return (1 + H - S) / 2 |
|---|
| 128 | |
|---|
| 129 | def corpusSize(self, language=None): |
|---|
| 130 | |
|---|
| 131 | return self.backend.word_count(language=language) |
|---|
| 132 | |
|---|
| 133 | def categorySize(self, category, language=None): |
|---|
| 134 | return self.backend.word_count(category=category, language=language) |
|---|
| 135 | |
|---|
| 136 | def _buildWordProbabilities(self, language=None): |
|---|
| 137 | probs = {} |
|---|
| 138 | corpus_size = self.corpusSize(language) |
|---|
| 139 | words = list(self.backend.list_words(language, complete=True)) |
|---|
| 140 | for cat in self.backend.list_categories(): |
|---|
| 141 | |
|---|
| 142 | probs[cat] = self._buildCategoryWordProbabilities(cat, language, |
|---|
| 143 | corpus_size, words) |
|---|
| 144 | return probs |
|---|
| 145 | |
|---|
| 146 | def _buildCategoryWordProbabilities(self, category, language=None, |
|---|
| 147 | corpus_size=None, words=None): |
|---|
| 148 | """Merges corpora and computes probabilities |
|---|
| 149 | |
|---|
| 150 | XXX to be cached later (invalidation on word adding) |
|---|
| 151 | """ |
|---|
| 152 | if corpus_size is None: |
|---|
| 153 | corpus_size = self.corpusSize(language) |
|---|
| 154 | if words is None: |
|---|
| 155 | words = self.backend.list_words(language, complete=True) |
|---|
| 156 | if language is None: |
|---|
| 157 | language = self.language |
|---|
| 158 | |
|---|
| 159 | category_size = float(self.categorySize(category, language)) |
|---|
| 160 | them_count = float(max(corpus_size - category_size, 1)) |
|---|
| 161 | probabilities = {} |
|---|
| 162 | |
|---|
| 163 | for word in words: |
|---|
| 164 | the_word = word[1][1] |
|---|
| 165 | |
|---|
| 166 | if category not in the_word.keys(): |
|---|
| 167 | continue |
|---|
| 168 | |
|---|
| 169 | word_count = float(word[1][2]) |
|---|
| 170 | if word_count == 0.0: |
|---|
| 171 | continue |
|---|
| 172 | |
|---|
| 173 | cat_word_count = float(the_word[category]) |
|---|
| 174 | other_count = word_count - cat_word_count |
|---|
| 175 | |
|---|
| 176 | if category_size == 0: |
|---|
| 177 | good_metric = 1.0 |
|---|
| 178 | else: |
|---|
| 179 | good_metric = min(1.0, other_count/category_size) |
|---|
| 180 | |
|---|
| 181 | if them_count == 0: |
|---|
| 182 | continue |
|---|
| 183 | |
|---|
| 184 | bad_metric = min(1.0, cat_word_count/them_count) |
|---|
| 185 | |
|---|
| 186 | try: |
|---|
| 187 | f = bad_metric / (good_metric + bad_metric) |
|---|
| 188 | except ZeroDivisionError: |
|---|
| 189 | continue |
|---|
| 190 | |
|---|
| 191 | # PROBABILITY_THRESHOLD |
|---|
| 192 | if abs(f-0.5) >= 0.1 : |
|---|
| 193 | # GOOD_PROB, BAD_PROB |
|---|
| 194 | probabilities[word[0]] = max(0.0001, min(0.9999, f)) |
|---|
| 195 | |
|---|
| 196 | return probabilities |
|---|