Changeset 162:ca1529c62df9 for classifier
- Timestamp: 08/21/07 19:12:16 (12 months ago)
- Location: classifier
- Files: 5 modified
  - classifier.py (modified) (6 diffs)
  - doc/classifier.txt (modified) (5 diffs)
  - doc/storage.txt (modified) (2 diffs)
  - storage.py (modified) (11 diffs)
  - tests/test_docs.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
classifier/classifier.py
r159 r162 31 31 self.backend = backend 32 32 self.tokenizer = tokenizer 33 self._learnt = True 34 self._probs = None 35 33 36 if options is None: 34 37 self.options = {'lang': self.language} … … 42 45 wich means: store words in categories 43 46 """ 47 self._learnt = True 44 48 self.backend.add_category(name=category) 45 49 data = self.tokenizer.transform(data, self.options) … … 52 56 wich means: remove words from categories 53 57 """ 58 self._learnt = True 54 59 data = self.tokenizer.transform(data, self.options) 55 60 for element in data: 56 self.backend.del Word(element, category)61 self.backend.del_word(element, category) 57 62 58 63 def guess(self, data): … … 62 67 63 68 # XXX this will be cached 64 probabilities = self._buildWordProbabilities() 69 if self._learnt: 70 self._probs = self._buildWordProbabilities() 71 else: 72 if self._probs is None: 73 self._probs = self._buildWordProbabilities() 74 75 probabilities = self._probs 76 self._learnt = False 65 77 66 78 res = {} … … 124 136 def _buildWordProbabilities(self, language=None): 125 137 probs = {} 138 corpus_size = self.corpusSize(language) 139 words = list(self.backend.list_words(language, complete=True)) 126 140 for cat in self.backend.list_categories(): 127 probs[cat] = self._buildCategoryWordProbabilities(cat, language) 141 142 probs[cat] = self._buildCategoryWordProbabilities(cat, language, 143 corpus_size, words) 128 144 return probs 129 145 130 def _buildCategoryWordProbabilities(self, category, language=None): 146 def _buildCategoryWordProbabilities(self, category, language=None, 147 corpus_size=None, words=None): 131 148 """Merges corpora and computes probabilities 132 149 133 150 XXX to be cached later (invalidation on word adding) 134 151 """ 152 if corpus_size is None: 153 corpus_size = self.corpusSize(language) 154 if words is None: 155 words = self.backend.list_words(language, complete=True) 135 156 if language is None: 136 157 language = self.language 137 corpus_size = 
self.corpusSize(language) 158 138 159 category_size = float(self.categorySize(category, language)) 139 160 them_count = float(max(corpus_size - category_size, 1)) 140 161 probabilities = {} 141 words = self.backend.list_words(language, complete=True)142 162 143 163 for word in words: 144 if category not in word[1][1].keys(): 164 the_word = word[1][1] 165 166 if category not in the_word.keys(): 145 167 continue 146 168 … … 149 171 continue 150 172 151 cat_word_count = float( word[1][1][category])152 other_count = cat_word_count -word_count173 cat_word_count = float(the_word[category]) 174 other_count = word_count - cat_word_count 153 175 154 176 if category_size == 0: -
classifier/doc/classifier.txt
r160 r162 23 23 >>> tokenizer = AllFilters() 24 24 >>> classifier = BayesClassifier('fr', backend, tokenizer) 25 26 Le backend doit être vide au début:: 27 28 >>> backend.word_count() 29 0 25 30 26 31 Le classificateur fait deux choses: apprendre et deviner, pour une langue … … 59 64 [(u'achetez', 0.99...), (u'kimouss', 0.99...)] 60 65 61 >>> classifier.learn('savon ki mouss par-ci, savon par-la, savon toujours',66 >>> classifier.learn('savon kipouss par-ci, savon par-la, savon toujours', 62 67 ... 'song') 63 68 >>> sorted(classifier._buildCategoryWordProbabilities('spam').items()) #2 64 [(u'achetez', 0.99...), (u'kimouss', 0.99...), (u'savon', 0.0001)] 69 [(u'achetez', 0.99...), (u'kimouss', 0.99...), (u'savon', 0.14...)] 70 71 >>> classifier.categorySize('song') 72 4 65 73 66 74 >>> sorted(classifier._buildCategoryWordProbabilities('song').items()) 67 [(u'ki mouss', 0.0001), (u'par', 0.99...), (u'savon', 0.99...), (u'toujours', 0.99...)]75 [(u'kipouss', 0.99...), (u'par', 0.99...), (u'toujours', 0.99...)] 68 76 69 77 Ce calcul est fait pour toutes les catégories:: … … 73 81 74 82 75 La reconnaissance se base sur ce filtrage de mots, puis appl que l'algo de83 La reconnaissance se base sur ce filtrage de mots, puis applique l'algo de 76 84 Robinson-fisher:: 77 85 78 86 >>> classifier.guess('achetez mon savon KIPOUSS') 79 [(u'song', 0.99...), (u'spam', 0. 5), ...]87 [(u'song', 0.99...), (u'spam', 0.70...), (u'friend', 0.16...)] 80 88 81 89 We lower default treshold first:: … … 126 134 >>> source2 = open(file).read() 127 135 >>> classifier.guess(source2) 128 [(u'python', 1.0), (u'doctest', ...e-...)]136 [(u'python', ...), ...] 129 137 130 138 Le classificateur doit aussi savoir `désapprendre`:: … … 133 141 >>> classifier.unlearn(source2, 'python') 134 142 >>> classifier.guess(source2) 135 [(u'doctest', 1.0)]143 [(u'doctest', ...), ...] 136 144 137 145 -
classifier/doc/storage.txt
r157 r162 10 10 >>> import settings 11 11 >>> settings.SQLURI = 'sqlite:///%s' % db_file 12 >>> import os 13 >>> if os.path.exists(db_file): 14 ... os.remove(db_file) 15 12 16 13 17 It works with languages, words, categories, and words within categories:: … … 18 22 bayesian data:: 19 23 20 >>> storage = SQLStorage('tarek') 21 24 >>> storage = SQLStorage('tester') 22 25 23 26 Next we can store languages, since each word is in a given language:: -
classifier/storage.py
r159 r162 25 25 lang = Table('classifier_lang', _metadata, 26 26 Column('iso', String(3), primary_key=True), 27 Column('user', String(40) ))27 Column('user', String(40), primary_key=True)) 28 28 29 29 word = Table('classifier_word', _metadata, … … 31 31 Column('lang_iso', String(3), ForeignKey('classifier_lang.iso')), 32 32 Column('count', Integer), 33 Column('user', String(40) )33 Column('user', String(40), primary_key=True) 34 34 ) 35 35 … … 37 37 Column('name', String(50), primary_key=True), 38 38 Column('description', String(300)), 39 Column('user', String(40) ))39 Column('user', String(40), primary_key=True)) 40 40 41 41 42 42 word_category = Table('classifier_word_category', _metadata, 43 Column('id', Integer, primary_key=True),43 #Column('id', Integer, primary_key=True), 44 44 Column('word_value', String(200), 45 ForeignKey('classifier_word.value') ),45 ForeignKey('classifier_word.value'), primary_key=True), 46 46 Column('count', Integer), 47 47 Column('category_name', String(50), 48 ForeignKey('classifier_category.name') ),49 Column('user', String(40) ))48 ForeignKey('classifier_category.name'), primary_key=True), 49 Column('user', String(40), primary_key=True)) 50 50 51 51 … … 151 151 sel = word.select(and_(word.c.value==the_word, 152 152 word.c.user==self._user)) 153 selection = sel.execute().fetch all()153 selection = sel.execute().fetchone() 154 154 cat_insert = word_category.insert() 155 155 156 157 if len(selection) == 0: 156 if selection is None: 158 157 # new word, let's add it 159 158 res = word.insert().execute(value=the_word, count=1, … … 169 168 res = word.update(and_(word.c.value==the_word, 170 169 word.c.user==self._user)) 171 res.execute(count=selection [0].count+1)172 173 sl = and_(word_category.c.word_value==the_word,174 word_category.c.user==self._user)175 cat = word_category.select(sl)176 cat = cat.execute().fetchall()170 res.execute(count=selection.count+1) 171 172 #sl = and_(word_category.c.word_value==the_word, 173 # 
word_category.c.user==self._user) 174 #cat = word_category.select(sl) 175 #cat = cat.execute().fetchall() 177 176 178 177 #for old_cat in cat: … … 185 184 # word_category.delete(sl).execute() 186 185 186 existing_cats = self.list_categories() 187 187 for category in categories: 188 188 # let's check if the category exists in category 189 if category not in self.list_categories():189 if category not in existing_cats: 190 190 self.add_category(category) 191 191 … … 194 194 word_category.c.word_value==the_word) 195 195 cat = word_category.select(sl) 196 cat = cat.execute().fetchall() 197 198 if cat == []:196 197 cat = cat.execute().fetchone() 198 if cat is None: 199 199 cat_insert.execute(word_value=the_word, 200 200 category_name=category, … … 202 202 203 203 else: 204 cat_count = cat[0].count205 id_ = cat[0].id206 up = word_category.update(word_category.c.id==id_)207 up.execute(count=cat _count+1)204 up = word_category.update(and_(word_category.c.user==cat.user, 205 word_category.c.word_value==cat.word_value, 206 word_category.c.category_name==cat.category_name)) 207 up.execute(count=cat.count+1) 208 208 209 209 … … 213 213 selection = and_(word_category.c.word_value == word_value, 214 214 word_category.c.user==self._user) 215 catwords = word_category.select(selection).execute().fetchall() 215 216 catwords = word_category.select(selection).execute() 217 216 218 word_sel = and_(word.c.value == word_value, 217 word_category.c.user==self._user) 219 word.c.user==self._user) 220 218 221 langs = [the_word.lang_iso for the_word in 219 word.select(word_sel).execute() .fetchall()]222 word.select(word_sel).execute()] 220 223 221 224 cats = {} … … 240 243 """Remove a word""" 241 244 sl = and_(word.c.value==the_word, word.c.user==self._user) 242 results = word.select(sl).execute().fetchall()243 244 if len(results) == 0:245 the_word = word.select(sl).execute().fetchone() 246 247 if the_word is None: 245 248 return None 246 247 the_word = results[0]248 249 249 250 # picking up 
categorized words to work with 250 251 if categories is None: 251 252 sl = word_category.c.word_value == the_word.value 252 catwords = word_category.select(sl).execute(user=self._user) .fetchall()253 catwords = word_category.select(sl).execute(user=self._user) 253 254 else: 254 255 categories = _tuplify(categories) 255 sl = word_category.c.word_ id == word.id256 sl = word_category.c.word_value == the_word.value 256 257 sl2 = word_category.c.category_name.in_(*categories) 257 258 sl = and_(sl, sl2) 258 catwords = \ 259 word_category.select(sl).execute(user=self._user).fetchall() 259 catwords = word_category.select(sl).execute(user=self._user) 260 260 261 261 # remove categorized words … … 272 272 word_category.delete(sl).execute() 273 273 else: 274 rs = word_category.select(sl).execute().fetchall() 275 if rs != []: 274 rs = word_category.select(sl).execute().fetchone() 275 rs_count = rs.count 276 rs.close() 277 if rs is not None: 276 278 sl2 = word_category.update(sl) 277 sl2.execute(count=rs .count-1, user=self._user)279 sl2.execute(count=rs_count-1, user=self._user) 278 280 279 281 # remove empty categories 280 282 for category in categories: 281 sl = and_(word_category.c.category_name ==category,283 sl = and_(word_category.c.category_name==category, 282 284 word_category.c.user==self._user) 283 285 284 current_cat = word_category.select(sl).execute().fetchall() 285 if len(current_cat) > 0: 286 current_cat = current_cat[0] 286 current_cat = word_category.select(sl).execute().fetchone() 287 if current_cat is not None: 287 288 if current_cat.count == 0: 288 word_category.delete(sl).execute() 289 else: 290 # remove categories 291 sl = and_(word_category.c.word_value==the_word.value, 292 word_category.c.user==self._user) 293 word_category.delete(sl).execute() 289 current_cat.close() 290 word_category.delete(sl).execute() 291 #else: 292 # # remove categories 293 # sl = and_(word_category.c.word_value==the_word.value, 294 # word_category.c.user==self._user) 295 # 
word_category.delete(sl).execute() 294 296 295 297 # removing word if none use it 296 298 sl = and_(word.c.value==the_word.value, word.c.user==self._user) 297 298 #if word.select(sl).execute().fetchall()[0].count == 0: 299 word.delete(sl).execute() 299 if word.select(sl).execute().fetchone().count == 0: 300 word.delete(sl).execute() 300 301 301 302 def word_count(self, category=None, language=None): 302 303 303 if category is None: 304 304 if language is None: 305 305 sl = word.c.user == self._user 306 return len(word.select(sl).execute().fetchall()) 306 #return len(word.select(sl).execute().fetchall()) 307 return word.select(sl).count().execute().fetchone()[0] 307 308 else: 308 309 sl = and_(word.c.user == self._user, 309 310 word.c.lang_iso == language) 310 return len(word.select(sl).execute().fetchall()) 311 return word.select(sl).count().execute().fetchone()[0] 312 #return len(word.select(sl).execute().fetchall()) 311 313 else: 312 314 categories = _tuplify(category) 313 315 in_ = word_category.c.category_name.in_(*categories) 314 316 sl = and_(in_, word_category.c.user == self._user) 315 res = word_category.select(in_).execute() 316 return len(res.fetchall()) 317 return word_category.select(sl).count().execute().fetchone()[0] 318 #res = word_category.select(sl).execute() 319 #return len(res.fetchall()) 317 320 318 321 -
classifier/tests/test_docs.py
r157 r162 54 54 55 55 56 current_dir = os.path.dirname(__file__) 56 #current_dir = os.path.dirname(__file__) 57 current_dir = '.' 57 58 58 59 def test_suite():
