Changeset 162:ca1529c62df9
- Timestamp:
- 08/21/07 19:12:16 (9 months ago)
- Author:
- Tarek Ziadé <tarek@…>
- Message:
-
fixes
- Location:
- classifier
- Files:
-
Legend:
- Unmodified
- Added
- Removed
-
|
r159
|
r162
|
|
| 31 | 31 | self.backend = backend |
| 32 | 32 | self.tokenizer = tokenizer |
| | 33 | self._learnt = True |
| | 34 | self._probs = None |
| | 35 | |
| 33 | 36 | if options is None: |
| 34 | 37 | self.options = {'lang': self.language} |
| … |
… |
|
| 42 | 45 | wich means: store words in categories |
| 43 | 46 | """ |
| | 47 | self._learnt = True |
| 44 | 48 | self.backend.add_category(name=category) |
| 45 | 49 | data = self.tokenizer.transform(data, self.options) |
| … |
… |
|
| 52 | 56 | wich means: remove words from categories |
| 53 | 57 | """ |
| | 58 | self._learnt = True |
| 54 | 59 | data = self.tokenizer.transform(data, self.options) |
| 55 | 60 | for element in data: |
| 56 | | self.backend.delWord(element, category) |
| | 61 | self.backend.del_word(element, category) |
| 57 | 62 | |
| 58 | 63 | def guess(self, data): |
| … |
… |
|
| 62 | 67 | |
| 63 | 68 | # XXX this will be cached |
| 64 | | probabilities = self._buildWordProbabilities() |
| | 69 | if self._learnt: |
| | 70 | self._probs = self._buildWordProbabilities() |
| | 71 | else: |
| | 72 | if self._probs is None: |
| | 73 | self._probs = self._buildWordProbabilities() |
| | 74 | |
| | 75 | probabilities = self._probs |
| | 76 | self._learnt = False |
| 65 | 77 | |
| 66 | 78 | res = {} |
| … |
… |
|
| 124 | 136 | def _buildWordProbabilities(self, language=None): |
| 125 | 137 | probs = {} |
| | 138 | corpus_size = self.corpusSize(language) |
| | 139 | words = list(self.backend.list_words(language, complete=True)) |
| 126 | 140 | for cat in self.backend.list_categories(): |
| 127 | | probs[cat] = self._buildCategoryWordProbabilities(cat, language) |
| | 141 | |
| | 142 | probs[cat] = self._buildCategoryWordProbabilities(cat, language, |
| | 143 | corpus_size, words) |
| 128 | 144 | return probs |
| 129 | 145 | |
| 130 | | def _buildCategoryWordProbabilities(self, category, language=None): |
| | 146 | def _buildCategoryWordProbabilities(self, category, language=None, |
| | 147 | corpus_size=None, words=None): |
| 131 | 148 | """Merges corpora and computes probabilities |
| 132 | 149 | |
| 133 | 150 | XXX to be cached later (invalidation on word adding) |
| 134 | 151 | """ |
| | 152 | if corpus_size is None: |
| | 153 | corpus_size = self.corpusSize(language) |
| | 154 | if words is None: |
| | 155 | words = self.backend.list_words(language, complete=True) |
| 135 | 156 | if language is None: |
| 136 | 157 | language = self.language |
| 137 | | corpus_size = self.corpusSize(language) |
| | 158 | |
| 138 | 159 | category_size = float(self.categorySize(category, language)) |
| 139 | 160 | them_count = float(max(corpus_size - category_size, 1)) |
| 140 | 161 | probabilities = {} |
| 141 | | words = self.backend.list_words(language, complete=True) |
| 142 | 162 | |
| 143 | 163 | for word in words: |
| 144 | | if category not in word[1][1].keys(): |
| | 164 | the_word = word[1][1] |
| | 165 | |
| | 166 | if category not in the_word.keys(): |
| 145 | 167 | continue |
| 146 | 168 | |
| … |
… |
|
| 149 | 171 | continue |
| 150 | 172 | |
| 151 | | cat_word_count = float(word[1][1][category]) |
| 152 | | other_count = cat_word_count - word_count |
| | 173 | cat_word_count = float(the_word[category]) |
| | 174 | other_count = word_count - cat_word_count |
| 153 | 175 | |
| 154 | 176 | if category_size == 0: |
-
|
r160
|
r162
|
|
| 23 | 23 | >>> tokenizer = AllFilters() |
| 24 | 24 | >>> classifier = BayesClassifier('fr', backend, tokenizer) |
| | 25 | |
| | 26 | Le backend doit être vide au début:: |
| | 27 | |
| | 28 | >>> backend.word_count() |
| | 29 | 0 |
| 25 | 30 | |
| 26 | 31 | Le classificateur fait deux choses: apprendre et deviner, pour une langue |
| … |
… |
|
| 59 | 64 | [(u'achetez', 0.99...), (u'kimouss', 0.99...)] |
| 60 | 65 | |
| 61 | | >>> classifier.learn('savon kimouss par-ci, savon par-la, savon toujours', |
| | 66 | >>> classifier.learn('savon kipouss par-ci, savon par-la, savon toujours', |
| 62 | 67 | ... 'song') |
| 63 | 68 | >>> sorted(classifier._buildCategoryWordProbabilities('spam').items()) #2 |
| 64 | | [(u'achetez', 0.99...), (u'kimouss', 0.99...), (u'savon', 0.0001)] |
| | 69 | [(u'achetez', 0.99...), (u'kimouss', 0.99...), (u'savon', 0.14...)] |
| | 70 | |
| | 71 | >>> classifier.categorySize('song') |
| | 72 | 4 |
| 65 | 73 | |
| 66 | 74 | >>> sorted(classifier._buildCategoryWordProbabilities('song').items()) |
| 67 | | [(u'kimouss', 0.0001), (u'par', 0.99...), (u'savon', 0.99...), (u'toujours', 0.99...)] |
| | 75 | [(u'kipouss', 0.99...), (u'par', 0.99...), (u'toujours', 0.99...)] |
| 68 | 76 | |
| 69 | 77 | Ce calcul est fait pour toutes les catégories:: |
| … |
… |
|
| 73 | 81 | |
| 74 | 82 | |
| 75 | | La reconnaissance se base sur ce filtrage de mots, puis applque l'algo de |
| | 83 | La reconnaissance se base sur ce filtrage de mots, puis applique l'algo de |
| 76 | 84 | Robinson-fisher:: |
| 77 | 85 | |
| 78 | 86 | >>> classifier.guess('achetez mon savon KIPOUSS') |
| 79 | | [(u'song', 0.99...), (u'spam', 0.5), ...] |
| | 87 | [(u'song', 0.99...), (u'spam', 0.70...), (u'friend', 0.16...)] |
| 80 | 88 | |
| 81 | 89 | We lower default treshold first:: |
| … |
… |
|
| 126 | 134 | >>> source2 = open(file).read() |
| 127 | 135 | >>> classifier.guess(source2) |
| 128 | | [(u'python', 1.0), (u'doctest', ...e-...)] |
| | 136 | [(u'python', ...), ...] |
| 129 | 137 | |
| 130 | 138 | Le classificateur doit aussi savoir `désapprendre`:: |
| … |
… |
|
| 133 | 141 | >>> classifier.unlearn(source2, 'python') |
| 134 | 142 | >>> classifier.guess(source2) |
| 135 | | [(u'doctest', 1.0)] |
| | 143 | [(u'doctest', ...), ...] |
| 136 | 144 | |
| 137 | 145 | |
-
|
r157
|
r162
|
|
| 10 | 10 | >>> import settings |
| 11 | 11 | >>> settings.SQLURI = 'sqlite:///%s' % db_file |
| | 12 | >>> import os |
| | 13 | >>> if os.path.exists(db_file): |
| | 14 | ... os.remove(db_file) |
| | 15 | |
| 12 | 16 | |
| 13 | 17 | It works with languages, words, categories, and words within categories:: |
| … |
… |
|
| 18 | 22 | bayesian data:: |
| 19 | 23 | |
| 20 | | >>> storage = SQLStorage('tarek') |
| 21 | | |
| | 24 | >>> storage = SQLStorage('tester') |
| 22 | 25 | |
| 23 | 26 | Next we can store languages, since each word is in a given language:: |
-
|
r159
|
r162
|
|
| 25 | 25 | lang = Table('classifier_lang', _metadata, |
| 26 | 26 | Column('iso', String(3), primary_key=True), |
| 27 | | Column('user', String(40))) |
| | 27 | Column('user', String(40), primary_key=True)) |
| 28 | 28 | |
| 29 | 29 | word = Table('classifier_word', _metadata, |
| … |
… |
|
| 31 | 31 | Column('lang_iso', String(3), ForeignKey('classifier_lang.iso')), |
| 32 | 32 | Column('count', Integer), |
| 33 | | Column('user', String(40)) |
| | 33 | Column('user', String(40), primary_key=True) |
| 34 | 34 | ) |
| 35 | 35 | |
| … |
… |
|
| 37 | 37 | Column('name', String(50), primary_key=True), |
| 38 | 38 | Column('description', String(300)), |
| 39 | | Column('user', String(40))) |
| | 39 | Column('user', String(40), primary_key=True)) |
| 40 | 40 | |
| 41 | 41 | |
| 42 | 42 | word_category = Table('classifier_word_category', _metadata, |
| 43 | | Column('id', Integer, primary_key=True), |
| | 43 | #Column('id', Integer, primary_key=True), |
| 44 | 44 | Column('word_value', String(200), |
| 45 | | ForeignKey('classifier_word.value')), |
| | 45 | ForeignKey('classifier_word.value'), primary_key=True), |
| 46 | 46 | Column('count', Integer), |
| 47 | 47 | Column('category_name', String(50), |
| 48 | | ForeignKey('classifier_category.name')), |
| 49 | | Column('user', String(40))) |
| | 48 | ForeignKey('classifier_category.name'), primary_key=True), |
| | 49 | Column('user', String(40), primary_key=True)) |
| 50 | 50 | |
| 51 | 51 | |
| … |
… |
|
| 151 | 151 | sel = word.select(and_(word.c.value==the_word, |
| 152 | 152 | word.c.user==self._user)) |
| 153 | | selection = sel.execute().fetchall() |
| | 153 | selection = sel.execute().fetchone() |
| 154 | 154 | cat_insert = word_category.insert() |
| 155 | 155 | |
| 156 | | |
| 157 | | if len(selection) == 0: |
| | 156 | if selection is None: |
| 158 | 157 | # new word, let's add it |
| 159 | 158 | res = word.insert().execute(value=the_word, count=1, |
| … |
… |
|
| 169 | 168 | res = word.update(and_(word.c.value==the_word, |
| 170 | 169 | word.c.user==self._user)) |
| 171 | | res.execute(count=selection[0].count+1) |
| 172 | | |
| 173 | | sl = and_(word_category.c.word_value==the_word, |
| 174 | | word_category.c.user==self._user) |
| 175 | | cat = word_category.select(sl) |
| 176 | | cat = cat.execute().fetchall() |
| | 170 | res.execute(count=selection.count+1) |
| | 171 | |
| | 172 | #sl = and_(word_category.c.word_value==the_word, |
| | 173 | # word_category.c.user==self._user) |
| | 174 | #cat = word_category.select(sl) |
| | 175 | #cat = cat.execute().fetchall() |
| 177 | 176 | |
| 178 | 177 | #for old_cat in cat: |
| … |
… |
|
| 185 | 184 | # word_category.delete(sl).execute() |
| 186 | 185 | |
| | 186 | existing_cats = self.list_categories() |
| 187 | 187 | for category in categories: |
| 188 | 188 | # let's check if the category exists in category |
| 189 | | if category not in self.list_categories(): |
| | 189 | if category not in existing_cats: |
| 190 | 190 | self.add_category(category) |
| 191 | 191 | |
| … |
… |
|
| 194 | 194 | word_category.c.word_value==the_word) |
| 195 | 195 | cat = word_category.select(sl) |
| 196 | | cat = cat.execute().fetchall() |
| 197 | | |
| 198 | | if cat == []: |
| | 196 | |
| | 197 | cat = cat.execute().fetchone() |
| | 198 | if cat is None: |
| 199 | 199 | cat_insert.execute(word_value=the_word, |
| 200 | 200 | category_name=category, |
| … |
… |
|
| 202 | 202 | |
| 203 | 203 | else: |
| 204 | | cat_count = cat[0].count |
| 205 | | id_ = cat[0].id |
| 206 | | up = word_category.update(word_category.c.id==id_) |
| 207 | | up.execute(count=cat_count+1) |
| | 204 | up = word_category.update(and_(word_category.c.user==cat.user, |
| | 205 | word_category.c.word_value==cat.word_value, |
| | 206 | word_category.c.category_name==cat.category_name)) |
| | 207 | up.execute(count=cat.count+1) |
| 208 | 208 | |
| 209 | 209 | |
| … |
… |
|
| 213 | 213 | selection = and_(word_category.c.word_value == word_value, |
| 214 | 214 | word_category.c.user==self._user) |
| 215 | | catwords = word_category.select(selection).execute().fetchall() |
| | 215 | |
| | 216 | catwords = word_category.select(selection).execute() |
| | 217 | |
| 216 | 218 | word_sel = and_(word.c.value == word_value, |
| 217 | | word_category.c.user==self._user) |
| | 219 | word.c.user==self._user) |
| | 220 | |
| 218 | 221 | langs = [the_word.lang_iso for the_word in |
| 219 | | word.select(word_sel).execute().fetchall()] |
| | 222 | word.select(word_sel).execute()] |
| 220 | 223 | |
| 221 | 224 | cats = {} |
| … |
… |
|
| 240 | 243 | """Remove a word""" |
| 241 | 244 | sl = and_(word.c.value==the_word, word.c.user==self._user) |
| 242 | | results = word.select(sl).execute().fetchall() |
| 243 | | |
| 244 | | if len(results) == 0: |
| | 245 | the_word = word.select(sl).execute().fetchone() |
| | 246 | |
| | 247 | if the_word is None: |
| 245 | 248 | return None |
| 246 | | |
| 247 | | the_word = results[0] |
| 248 | 249 | |
| 249 | 250 | # picking up categorized words to work with |
| 250 | 251 | if categories is None: |
| 251 | 252 | sl = word_category.c.word_value == the_word.value |
| 252 | | catwords = word_category.select(sl).execute(user=self._user).fetchall() |
| | 253 | catwords = word_category.select(sl).execute(user=self._user) |
| 253 | 254 | else: |
| 254 | 255 | categories = _tuplify(categories) |
| 255 | | sl = word_category.c.word_id == word.id |
| | 256 | sl = word_category.c.word_value == the_word.value |
| 256 | 257 | sl2 = word_category.c.category_name.in_(*categories) |
| 257 | 258 | sl = and_(sl, sl2) |
| 258 | | catwords = \ |
| 259 | | word_category.select(sl).execute(user=self._user).fetchall() |
| | 259 | catwords = word_category.select(sl).execute(user=self._user) |
| 260 | 260 | |
| 261 | 261 | # remove categorized words |
| … |
… |
|
| 272 | 272 | word_category.delete(sl).execute() |
| 273 | 273 | else: |
| 274 | | rs = word_category.select(sl).execute().fetchall() |
| 275 | | if rs != []: |
| | 274 | rs = word_category.select(sl).execute().fetchone() |
| | 275 | rs_count = rs.count |
| | 276 | rs.close() |
| | 277 | if rs is not None: |
| 276 | 278 | sl2 = word_category.update(sl) |
| 277 | | sl2.execute(count=rs.count-1, user=self._user) |
| | 279 | sl2.execute(count=rs_count-1, user=self._user) |
| 278 | 280 | |
| 279 | 281 | # remove empty categories |
| 280 | 282 | for category in categories: |
| 281 | | sl = and_(word_category.c.category_name == category, |
| | 283 | sl = and_(word_category.c.category_name==category, |
| 282 | 284 | word_category.c.user==self._user) |
| 283 | 285 | |
| 284 | | current_cat = word_category.select(sl).execute().fetchall() |
| 285 | | if len(current_cat) > 0: |
| 286 | | current_cat = current_cat[0] |
| | 286 | current_cat = word_category.select(sl).execute().fetchone() |
| | 287 | if current_cat is not None: |
| 287 | 288 | if current_cat.count == 0: |
| 288 | | word_category.delete(sl).execute() |
| 289 | | else: |
| 290 | | # remove categories |
| 291 | | sl = and_(word_category.c.word_value==the_word.value, |
| 292 | | word_category.c.user==self._user) |
| 293 | | word_category.delete(sl).execute() |
| | 289 | current_cat.close() |
| | 290 | word_category.delete(sl).execute() |
| | 291 | #else: |
| | 292 | # # remove categories |
| | 293 | # sl = and_(word_category.c.word_value==the_word.value, |
| | 294 | # word_category.c.user==self._user) |
| | 295 | # word_category.delete(sl).execute() |
| 294 | 296 | |
| 295 | 297 | # removing word if none use it |
| 296 | 298 | sl = and_(word.c.value==the_word.value, word.c.user==self._user) |
| 297 | | |
| 298 | | #if word.select(sl).execute().fetchall()[0].count == 0: |
| 299 | | word.delete(sl).execute() |
| | 299 | if word.select(sl).execute().fetchone().count == 0: |
| | 300 | word.delete(sl).execute() |
| 300 | 301 | |
| 301 | 302 | def word_count(self, category=None, language=None): |
| 302 | | |
| 303 | 303 | if category is None: |
| 304 | 304 | if language is None: |
| 305 | 305 | sl = word.c.user == self._user |
| 306 | | return len(word.select(sl).execute().fetchall()) |
| | 306 | #return len(word.select(sl).execute().fetchall()) |
| | 307 | return word.select(sl).count().execute().fetchone()[0] |
| 307 | 308 | else: |
| 308 | 309 | sl = and_(word.c.user == self._user, |
| 309 | 310 | word.c.lang_iso == language) |
| 310 | | return len(word.select(sl).execute().fetchall()) |
| | 311 | return word.select(sl).count().execute().fetchone()[0] |
| | 312 | #return len(word.select(sl).execute().fetchall()) |
| 311 | 313 | else: |
| 312 | 314 | categories = _tuplify(category) |
| 313 | 315 | in_ = word_category.c.category_name.in_(*categories) |
| 314 | 316 | sl = and_(in_, word_category.c.user == self._user) |
| 315 | | res = word_category.select(in_).execute() |
| 316 | | return len(res.fetchall()) |
| | 317 | return word_category.select(sl).count().execute().fetchone()[0] |
| | 318 | #res = word_category.select(sl).execute() |
| | 319 | #return len(res.fetchall()) |
| 317 | 320 | |
| 318 | 321 | |
-
|
r157
|
r162
|
|
| 54 | 54 | |
| 55 | 55 | |
| 56 | | current_dir = os.path.dirname(__file__) |
| | 56 | #current_dir = os.path.dirname(__file__) |
| | 57 | current_dir = '.' |
| 57 | 58 | |
| 58 | 59 | def test_suite(): |