- Timestamp:
- 05/30/07 16:47:14 (20 months ago)
- Files:
-
- 1 modified
Legend:
- Unmodified
- Added
- Removed
-
atomisator/filters/BayesCore/tokenizer/filters.py
r97 r102 1 # -*- coding: iso-8859-15-*-1 # -*- coding: utf8 -*- 2 2 # Copyright (c) 2006 Nuxeo SAS <http://nuxeo.com> 3 # Authors : Tarek Ziad é<tziade@nuxeo.com>3 # Authors : Tarek Ziadé <tziade@nuxeo.com> 4 4 # This program is free software; you can redistribute it and/or modify 5 5 # it under the terms of the GNU General Public License version 2 as published … … 43 43 44 44 def transform(self, text, options): 45 tokenizers = (' splitter', 'stopwords', 'normalizer', 'stemmer')45 tokenizers = ('normalizer', 'splitter', 'stopwords', 'stemmer') 46 46 return applyFilters(tokenizers, text, options) 47 47 … … 61 61 self.charset = options['charset'] 62 62 else: 63 self.charset = ' ISO-8859-15'63 self.charset = 'utf8' 64 64 65 65 def getFinalState(self, result): … … 76 76 name = 'splitter' 77 77 78 char_list = ',;:/\'"#?!.-=+_`|()[]{}<>~& §%'78 char_list = ',;:/\'"#?!.-=+_`|()[]{}<>~&§%' 79 79 80 80 def getName(self): … … 108 108 Splitter().split(text)]) 109 109 110 text = ''.join([self._cleanChar(char) for char in text]) 110 #text = ''.join([self._cleanChar(char) for char in text]) 111 text = ' '.join(text) 112 113 for char in self.char_list: 114 text = text.replace(char, ' ') 111 115 112 116 result = text.split() … … 124 128 125 129 name = 'stopwords' 130 treshold = 2 126 131 127 132 def getName(self): … … 152 157 lang = options['lang'] 153 158 stopwords = self._getStopWords(lang) 154 return [word for word in text if word not in stopwords] 159 return [word for word in text if (word not in stopwords 160 and len(word) > self.treshold)] 155 161 156 162 registerFilter(StopWords()) … … 190 196 else: 191 197 return car 192 normalized = [normalized(car) for car in word] 193 return ''.join(normalized) 198 199 #normalized = [normalized(car) for car in word] 200 for car in normalizer: 201 word = word.replace(car, normalizer[car]) 202 203 return word 204 #''.join(normalized) 194 205 195 206 def transform(self, text, options): … … 222 233 223 234 name = 'stemmer' 224 charset = ' ISO-8859-15'235 charset = 'utf8' 225 236 226 237 def getName(self): … … 262 273 def checktype(element): 263 274 if isinstance(element, str): 264 return element.decode(charset )275 return element.decode(charset, 'replace') 265 276 return element 266 277 … … 281 292 282 293 registerFilter(Stemmer()) 294
