Changeset 162:ca1529c62df9 for classifier/classifier.py
- Timestamp:
- 08/21/07 19:12:16 (11 months ago)
- Files:
-
- 1 modified
-
classifier/classifier.py (modified) (6 diffs)
Legend:
- Unmodified
- Added
- Removed
-
classifier/classifier.py
r159 r162
  31   31           self.backend = backend
  32   32           self.tokenizer = tokenizer
       33 +         self._learnt = True
       34 +         self._probs = None
       35 +
  33   36           if options is None:
  34   37               self.options = {'lang': self.language}
   …    …
  42   45           wich means: store words in categories
  43   46           """
       47 +         self._learnt = True
  44   48           self.backend.add_category(name=category)
  45   49           data = self.tokenizer.transform(data, self.options)
   …    …
  52   56           wich means: remove words from categories
  53   57           """
       58 +         self._learnt = True
  54   59           data = self.tokenizer.transform(data, self.options)
  55   60           for element in data:
  56      -             self.backend.delWord(element, category)
       61 +             self.backend.del_word(element, category)
  57   62
  58   63       def guess(self, data):
   …    …
  62   67
  63   68           # XXX this will be cached
  64      -         probabilities = self._buildWordProbabilities()
       69 +         if self._learnt:
       70 +             self._probs = self._buildWordProbabilities()
       71 +         else:
       72 +             if self._probs is None:
       73 +                 self._probs = self._buildWordProbabilities()
       74 +
       75 +         probabilities = self._probs
       76 +         self._learnt = False
  65   77
  66   78           res = {}
   …    …
 124  136       def _buildWordProbabilities(self, language=None):
 125  137           probs = {}
      138 +         corpus_size = self.corpusSize(language)
      139 +         words = list(self.backend.list_words(language, complete=True))
 126  140           for cat in self.backend.list_categories():
 127      -             probs[cat] = self._buildCategoryWordProbabilities(cat, language)
      141 +
      142 +             probs[cat] = self._buildCategoryWordProbabilities(cat, language,
      143 +                                                               corpus_size, words)
 128  144           return probs
 129  145
 130      -     def _buildCategoryWordProbabilities(self, category, language=None):
      146 +     def _buildCategoryWordProbabilities(self, category, language=None,
      147 +                                         corpus_size=None, words=None):
 131  148           """Merges corpora and computes probabilities
 132  149
 133  150           XXX to be cached later (invalidation on word adding)
 134  151           """
      152 +         if corpus_size is None:
      153 +             corpus_size = self.corpusSize(language)
      154 +         if words is None:
      155 +             words = self.backend.list_words(language, complete=True)
 135  156           if language is None:
 136  157               language = self.language
 137      -         corpus_size = self.corpusSize(language)
      158 +
 138  159           category_size = float(self.categorySize(category, language))
 139  160           them_count = float(max(corpus_size - category_size, 1))
 140  161           probabilities = {}
 141      -         words = self.backend.list_words(language, complete=True)
 142  162
 143  163           for word in words:
 144      -             if category not in word[1][1].keys():
      164 +             the_word = word[1][1]
      165 +
      166 +             if category not in the_word.keys():
 145  167                   continue
 146  168
   …    …
 149  171                   continue
 150  172
 151      -             cat_word_count = float( word[1][1][category])
 152      -             other_count = cat_word_count -word_count
      173 +             cat_word_count = float(the_word[category])
      174 +             other_count = word_count - cat_word_count
 153  175
 154  176               if category_size == 0:
