Changeset 102:917b0f7d3576 for atomisator/filters
- Timestamp:
- 05/30/07 16:47:14 (18 months ago)
- Location:
- atomisator/filters
- Files:
-
- 7 modified
-
BayesCore/classifier.py (modified) (2 diffs)
-
BayesCore/data/bayes.db (modified) (previous)
-
BayesCore/storage.py (modified) (1 diff)
-
BayesCore/tokenizer/filters.py (modified) (11 diffs)
-
BayesCore/tokenizer/normalized.fr.txt (modified) (1 diff)
-
BayesCore/tokenizer/normalized.txt (modified) (1 diff)
-
bayes.py (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
-
atomisator/filters/BayesCore/classifier.py
r97 r102 1 # -*- coding: iso-8859-15-*-1 # -*- coding: utf8 -*- 2 2 # Copyright (c) 2006 Nuxeo SAS <http://nuxeo.com> 3 # Authors : Tarek Ziad é<tziade@nuxeo.com>3 # Authors : Tarek Ziadé <tziade@nuxeo.com> 4 4 # This program is free software; you can redistribute it and/or modify 5 5 # it under the terms of the GNU General Public License version 2 as published … … 63 63 # XXX this will be cached 64 64 probabilities = self._buildWordProbabilities() 65 65 66 res = {} 66 67 for category_name in probabilities: -
atomisator/filters/BayesCore/storage.py
r97 r102 25 25 def __init__(self, sqluri): 26 26 self._words, self._catwords = self._get_mapper(sqluri) 27 self._sqluri = sqluri 28 27 29 try: 28 30 self._words.create() -
atomisator/filters/BayesCore/tokenizer/filters.py
r97 r102 1 # -*- coding: iso-8859-15-*-1 # -*- coding: utf8 -*- 2 2 # Copyright (c) 2006 Nuxeo SAS <http://nuxeo.com> 3 # Authors : Tarek Ziad é<tziade@nuxeo.com>3 # Authors : Tarek Ziadé <tziade@nuxeo.com> 4 4 # This program is free software; you can redistribute it and/or modify 5 5 # it under the terms of the GNU General Public License version 2 as published … … 43 43 44 44 def transform(self, text, options): 45 tokenizers = (' splitter', 'stopwords', 'normalizer', 'stemmer')45 tokenizers = ('normalizer', 'splitter', 'stopwords', 'stemmer') 46 46 return applyFilters(tokenizers, text, options) 47 47 … … 61 61 self.charset = options['charset'] 62 62 else: 63 self.charset = ' ISO-8859-15'63 self.charset = 'utf8' 64 64 65 65 def getFinalState(self, result): … … 76 76 name = 'splitter' 77 77 78 char_list = ',;:/\'"#?!.-=+_`|()[]{}<>~& §%'78 char_list = ',;:/\'"#?!.-=+_`|()[]{}<>~&§%' 79 79 80 80 def getName(self): … … 108 108 Splitter().split(text)]) 109 109 110 text = ''.join([self._cleanChar(char) for char in text]) 110 #text = ''.join([self._cleanChar(char) for char in text]) 111 text = ' '.join(text) 112 113 for char in self.char_list: 114 text = text.replace(char, ' ') 111 115 112 116 result = text.split() … … 124 128 125 129 name = 'stopwords' 130 treshold = 2 126 131 127 132 def getName(self): … … 152 157 lang = options['lang'] 153 158 stopwords = self._getStopWords(lang) 154 return [word for word in text if word not in stopwords] 159 return [word for word in text if (word not in stopwords 160 and len(word) > self.treshold)] 155 161 156 162 registerFilter(StopWords()) … … 190 196 else: 191 197 return car 192 normalized = [normalized(car) for car in word] 193 return ''.join(normalized) 198 199 #normalized = [normalized(car) for car in word] 200 for car in normalizer: 201 word = word.replace(car, normalizer[car]) 202 203 return word 204 #''.join(normalized) 194 205 195 206 def transform(self, text, options): … … 222 233 223 234 name = 'stemmer' 224 charset = ' ISO-8859-15'235 charset = 'utf8' 225 236 226 237 def getName(self): … … 262 273 def checktype(element): 263 274 if isinstance(element, str): 264 return element.decode(charset )275 return element.decode(charset, 'replace') 265 276 return element 266 277 … … 281 292 282 293 registerFilter(Stemmer()) 294 -
atomisator/filters/BayesCore/tokenizer/normalized.fr.txt
r97 r102 2 2 # will probably get TextIndexNG normalizer 3 3 # collection next 4 # and be fine-tuned later5 ée6 èe7 àa8 çc9 ùu4 # and be fine-tuned later 5 é e 6 Ú e 7 à a 8 ç c 9 ù u 10 10 # to be completed -
atomisator/filters/BayesCore/tokenizer/normalized.txt
r97 r102 2 2 # will probably get TextIndexNG normalizer 3 3 # collection next 4 # and be fine-tuned later5 ée6 èe7 àa8 çc9 ùu4 # and be fine-tuned later 5 é e 6 Ú e 7 à a 8 ç c 9 ù u 10 10 # to be completed -
atomisator/filters/bayes.py
r99 r102 49 49 SQLURI = 'sqlite:///filters/BayesCore/data/bayes.db' 50 50 51 def bayesian(entry, entries ):51 def bayesian(entry, entries, sqluri=None): 52 52 """uses bayesian inference over entries""" 53 53 content = entry['content'].encode('utf8') … … 59 59 60 60 data = '%s %s' % (content, title) 61 classifier = BayesClassifier(LANG, SQLStorage(SQLURI), AllFilters()) 61 62 if sqluri is None: 63 sqluri = SQLURI 64 65 classifier = BayesClassifier(LANG, SQLStorage(sqluri), AllFilters()) 62 66 63 67 # let's test the entry … … 67 71 return True 68 72 73 69 74 return result[0][0] == 'nojunk' 70 75 71 76 register_filter(bayesian) 72 77 73 def bayesian_learn(entry ):78 def bayesian_learn(entry, sqluri=None, answer=None): 74 79 """uses bayesian inference over entries""" 75 80 content = entry['content'].encode('utf8') … … 80 85 title = '' 81 86 82 print 'title : %s' % title 87 88 if sqluri is None: 89 sqluri = SQLURI 83 90 84 91 data = '%s %s' % (content, title) 85 classifier = BayesClassifier(LANG, SQLStorage(SQLURI), AllFilters())86 92 87 res = raw_input("Interesting (type 'm' for more) ? (y/n) ") 88 res = res.strip().lower() 93 classifier = BayesClassifier(LANG, SQLStorage(sqluri), AllFilters()) 89 94 90 if res == 'm': 91 print content 92 res = raw_input("Interesting ? (y/n) ") 95 if answer is None: 96 print 'title : %s' % title 93 97 98 res = raw_input("Interesting (type 'm' for more) ? (y/n) ") 99 answer = res.strip().lower() 94 100 95 print 'Learning...' 96 if res.strip().lower() in ('y', 'yes'): 101 if answer == 'm': 102 print content 103 answer = raw_input("Interesting ? (y/n) ") 104 105 print 'Learning...' 106 107 if answer.strip().lower() in ('y', 'yes'): 97 108 classifier.learn(data, 'nojunk') 98 109 else:
