| 1 | # -*- coding: utf8 -*- |
|---|
| 2 | # Copyright (c) 2006 Nuxeo SAS <http://nuxeo.com> |
|---|
| 3 | # Authors : Tarek Ziadé <tziade@nuxeo.com> |
|---|
| 4 | # This program is free software; you can redistribute it and/or modify |
|---|
| 5 | # it under the terms of the GNU General Public License version 2 as published |
|---|
| 6 | # by the Free Software Foundation. |
|---|
| 7 | # |
|---|
| 8 | # This program is distributed in the hope that it will be useful, |
|---|
| 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 11 | # GNU General Public License for more details. |
|---|
| 12 | # |
|---|
| 13 | # You should have received a copy of the GNU General Public License |
|---|
| 14 | # along with this program; if not, write to the Free Software |
|---|
| 15 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
|---|
| 16 | # 02111-1307, USA. |
|---|
| 17 | # |
|---|
| 18 | # $Id: filters.py 46887 2006-07-04 10:33:33Z sfermigier $ |
|---|
| 19 | import os |
|---|
| 20 | |
|---|
# Names exported by ``from <this module> import *``.
# NOTE(review): registerFilter is not listed here -- confirm whether other
# modules are expected to register their own filters through it.
__all__ = ['applyFilter', 'applyFilters', 'AllFilters']
|---|
| 22 | |
|---|
# Registry of filter instances, keyed by the name each filter reports.
filters = dict()


def registerFilter(filter_object):
    """Store filter_object in the module registry under its getName().

    Registering a second filter that reports the same name replaces the
    one stored before.
    """
    # the registry is mutated in place, so no ``global`` statement is needed
    key = filter_object.getName()
    filters[key] = filter_object
|---|
| 28 | |
|---|
def applyFilter(name, text, options):
    """Run text through the registered filter called name.

    The filter is looked up in the module-level ``filters`` registry (a
    KeyError propagates when nothing is registered under name) and the
    value of its transform(text, options) call is returned.
    """
    selected = filters[name]
    return selected.transform(text, options)
|---|
| 31 | |
|---|
def applyFilters(names, text, options):
    """Chain several registered filters over text.

    Filters are applied in the order given by names; each one receives the
    previous filter's output together with the same options. The output of
    the last filter is returned (text itself when names is empty).
    """
    current = text
    for filter_name in names:
        current = applyFilter(filter_name, current, options)
    return current
|---|
| 36 | |
|---|
class AllFilters(object):
    """Convenience filter that runs the whole filter chain in one call."""

    name = 'allfilters'

    def getName(self):
        """Return the name identifying this filter."""
        return self.name

    def transform(self, text, options):
        """Apply normalizer, splitter, stopwords and stemmer, in that order.

        Each step is looked up by name through applyFilters, so the four
        filters must have been registered before this is called.
        """
        return applyFilters(
            ('normalizer', 'splitter', 'stopwords', 'stemmer'),
            text,
            options)
|---|
| 47 | |
|---|
| 48 | |
|---|
class BaseFilter(object):
    """Helpers that hand results back in the string type of the input."""

    def setInitialState(self, text, options):
        """Record the input's string type and the charset to convert with.

        self.was_str becomes True when text (or, for a list, its first
        element) is a str; an empty list counts as not-str.
        self.charset becomes options['charset'] when given, else 'utf8'.
        """
        if not isinstance(text, list):
            self.was_str = isinstance(text, str)
        elif text:
            # a list is judged by its first element only
            self.was_str = isinstance(text[0], str)
        else:
            self.was_str = False

        self.charset = options.get('charset', 'utf8')

    def getFinalState(self, result):
        """Convert result to the string type recorded by setInitialState.

        Lists are converted element by element (recursively). unicode
        values are encoded to self.charset when the input was str, str
        values are decoded when it was not; anything else is returned
        unchanged.
        """
        if isinstance(result, list):
            return [self.getFinalState(item) for item in result]
        if isinstance(result, unicode):
            if self.was_str:
                return result.encode(self.charset, "replace")
        elif isinstance(result, str) and not self.was_str:
            return result.decode(self.charset)
        return result
|---|
| 73 | |
|---|
class TextSplitter(BaseFilter):
    """Split text into words, treating punctuation as separators.

    Uses zopyx.txng3's Splitter when that package can be imported and a
    pure-Python fallback otherwise.
    """

    name = 'splitter'

    # Characters the pure-Python fallback replaces by a space.
    char_list = ',;:/\'"#?!.-=+_`|()[]{}<>~&§%'

    def getName(self):
        """Return the name this filter is registered under."""
        return self.name

    def _cleanChar(self, char):
        """Return a space when char is in char_list, char itself otherwise.

        XXX at this time, we'll just use a small black list; we'll see
        later on adding a real normalizer for each language that does all
        this in one pass.
        """
        if char not in self.char_list:
            return char
        return ' '

    def _separators(self):
        """Return char_list as unicode, so that '§' is a single character."""
        separators = self.char_list
        if isinstance(separators, str):
            # this module is declared utf8, so the literal holds utf8 bytes
            separators = separators.decode('utf8')
        return separators

    def _toUnicode(self, value):
        """Decode a byte string with self.charset; pass anything else on."""
        if isinstance(value, str):
            return value.decode(self.charset, 'replace')
        return value

    def transform(self, text, options):
        """Return the list of words found in text.

        text may be a single string or an iterable of strings. The words
        come back in the string type of the input (see
        BaseFilter.getFinalState).

        With zopyx available the work is delegated to its Splitter; only
        the presence of options['treshold'] matters there (it selects
        singlechar=0). Otherwise separators from char_list are blanked
        out, the text is split on whitespace and lowercased, and when
        options['treshold'] is given only words of at least that many
        characters are kept.
        """
        self.setInitialState(text, options)

        try:
            from zopyx.txng3.splitter import Splitter
        except ImportError:
            Splitter = None

        if Splitter is not None:
            # NOTE(review): text is handed over unchanged; presumably
            # Splitter.split expects one string -- confirm what happens
            # when text is a list.
            if 'treshold' in options:
                splitter = Splitter(singlechar=0)
            else:
                splitter = Splitter()
            return self.getFinalState(list(splitter.split(text)))

        # Work on unicode from here on: replacing the utf8 bytes of '§'
        # one by one would damage other multi-byte characters in byte
        # strings and fail outright on unicode input.
        if isinstance(text, (str, unicode)):
            # a plain string must not be joined: that would put a space
            # between every character
            text = self._toUnicode(text)
        else:
            text = u' '.join([self._toUnicode(chunk) for chunk in text])

        for char in self._separators():
            text = text.replace(char, u' ')

        words = [word.lower() for word in text.split()]

        if 'treshold' in options:
            treshold = options['treshold']
            words = [word for word in words if len(word) >= treshold]

        return self.getFinalState(words)
|---|
| 124 | |
|---|
# Register a TextSplitter instance so it can be applied by its name
# ('splitter') through applyFilter / applyFilters.
registerFilter(TextSplitter())
|---|
| 126 | |
|---|
class StopWords(object):
    """Drop stop words and too-short words from a sequence of words."""

    name = 'stopwords'
    # Default length limit: only words strictly longer than this are kept.
    treshold = 2

    def getName(self):
        """Return the name this filter is registered under."""
        return self.name

    def _getStopWords(self, lang=None):
        """Return the stop words for lang as a list of strings.

        Simple text file, but will probably move to a DB storage.

        Words are read from 'stopwords.<lang>.txt' next to this module,
        or from 'stopwords.txt' when lang is None or no file exists for
        it. Blank lines and lines starting with '#' are skipped. An
        IOError propagates when the chosen file cannot be opened.
        """
        currentpath = os.path.dirname(__file__)
        filename = os.path.join(currentpath, 'stopwords.txt')
        if lang is not None:
            localized = os.path.join(currentpath, 'stopwords.%s.txt' % lang)
            if os.path.exists(localized):
                filename = localized

        # close the file explicitly instead of relying on garbage collection
        stream = open(filename)
        try:
            lines = stream.readlines()
        finally:
            stream.close()

        words = []
        for line in lines:
            # strip first, so that indented comment lines are skipped too
            word = line.strip()
            if word and not word.startswith('#'):
                words.append(word)
        return words

    def transform(self, text, options):
        """Return the words of text that are neither stop words nor short.

        text is returned untouched when options has no 'lang'. Otherwise
        a string is split on whitespace first, and a word is kept when it
        is not a stop word for options['lang'] and is strictly longer
        than options['treshold'] (default: self.treshold).

        NOTE(review): TextSplitter keeps words with len >= treshold while
        this filter requires len > treshold -- confirm that the
        difference is intended.
        """
        if 'lang' not in options:
            return text

        if isinstance(text, (unicode, str)):
            text = text.split()

        # a set gives constant-time membership tests for every word
        stopwords = set(self._getStopWords(options['lang']))
        tres = options.get('treshold', self.treshold)

        return [word for word in text
                if word not in stopwords and len(word) > tres]
|---|
| 166 | |
|---|
# Register a StopWords instance so it can be applied by its name
# ('stopwords') through applyFilter / applyFilters.
registerFilter(StopWords())
|---|
| 168 | |
|---|
class Normalizer(BaseFilter):
    """Replace character sequences according to a per-language table."""

    name = 'normalizer'

    def getName(self):
        """Return the name this filter is registered under."""
        return self.name

    def _getNormalizedChars(self, lang=None):
        """Return the replacement table for lang as a dict.

        Simple text file, but will probably move to a DB storage.

        The table is read from 'normalized.<lang>.txt' next to this
        module, or from 'normalized.txt' when lang is None or no file
        exists for it. Blank lines and lines starting with '#' are
        skipped; on every other line the first whitespace-separated field
        maps to the second one. An IndexError propagates for a line with
        fewer than two fields, an IOError when the file cannot be opened.
        """
        currentpath = os.path.dirname(__file__)
        filename = os.path.join(currentpath, 'normalized.txt')
        if lang is not None:
            localized = os.path.join(currentpath, 'normalized.%s.txt' % lang)
            if os.path.exists(localized):
                filename = localized

        # close the file explicitly instead of relying on garbage collection
        stream = open(filename)
        try:
            lines = stream.readlines()
        finally:
            stream.close()

        result = {}
        for line in lines:
            # strip first, so that indented comment lines are skipped too
            entry = line.strip()
            if not entry or entry.startswith('#'):
                continue
            splited = entry.split()
            result[splited[0]] = splited[1]
        return result

    def _normalize(self, word, normalizer):
        """Return word with every key of normalizer replaced by its value.

        NOTE(review): table entries are read from the file as byte
        strings; replacing them inside a unicode word goes through Python
        2's implicit ASCII coercion, which fails for non-ASCII entries --
        confirm what type callers pass here.
        """
        for car, replacement in normalizer.items():
            word = word.replace(car, replacement)
        return word

    def transform(self, text, options):
        """Return the normalized words of text.

        text is returned untouched when options has no 'lang'. Otherwise
        a string is split on whitespace first and every word is
        normalized with the table for options['lang'], through
        zopyx.txng3's normalizer when it can be imported and through
        _normalize otherwise. The result is converted back to the string
        type of the input (see BaseFilter.getFinalState).
        """
        self.setInitialState(text, options)

        if 'lang' not in options:
            return text

        if isinstance(text, (unicode, str)):
            text = text.split()

        table = self._getNormalizedChars(options['lang'])

        try:
            from zopyx.txng3 import normalizer
        except ImportError:
            normalizer = None

        if normalizer is None:
            result = [self._normalize(word, table) for word in text]
        else:
            result = normalizer.Normalizer(table.items()).normalize(text)

        return self.getFinalState(result)
|---|
| 234 | |
|---|
# Register a Normalizer instance so it can be applied by its name
# ('normalizer') through applyFilter / applyFilters.
registerFilter(Normalizer())
|---|
| 236 | |
|---|
class Stemmer(BaseFilter):
    """Reduce words to their stem with zopyx.txng3's stemmer, if present."""

    name = 'stemmer'
    # Class-level default; setInitialState overrides it per call.
    charset = 'utf8'

    def getName(self):
        """Return the name this filter is registered under."""
        return self.name

    def getStemmerLanguage(self, lang):
        """Return the stemmer's language name for lang, or None if unknown.

        The stemmer uses its own language names. Both the historical
        two-letter codes of this module and the ISO 639-1 codes of the
        same languages are accepted.
        """
        langs = {
            # historical codes, kept for backward compatibility
            'dn': 'danish', 'dt': 'dutch', 'en': 'english',
            'fr': 'french', 'de': 'german', 'it': 'italian',
            'nw': 'norwegian', 'pr': 'porter',
            'pg': 'portuguese', 'ru': 'russian', 'sp': 'spanish',
            'sw': 'swedish',
            # ISO 639-1 codes for the languages above
            'da': 'danish', 'nl': 'dutch', 'no': 'norwegian',
            'pt': 'portuguese', 'es': 'spanish', 'sv': 'swedish',
        }
        return langs.get(lang)

    def transform(self, text, options):
        """Return the stemmed words of text.

        text is returned untouched when options has no 'lang'. Otherwise
        a string is split on whitespace first and byte strings are
        decoded to unicode with the charset from options (default
        'utf8'). The words are stemmed when zopyx.txng3 can be imported
        and offers a stemmer for the language; if not, they are returned
        unstemmed. In both cases the result is converted back to the
        string type of the input (see BaseFilter.getFinalState).
        """
        self.setInitialState(text, options)

        if 'lang' not in options:
            return text

        if isinstance(text, (str, unicode)):
            text = text.split()

        # setInitialState stored options['charset'] (or 'utf8') in
        # self.charset already
        charset = self.charset
        words = []
        for element in text:
            if isinstance(element, str):
                element = element.decode(charset, 'replace')
            words.append(element)

        try:
            # imported under another name so that the module is not
            # shadowed by the stemmer instance created below
            from zopyx.txng3 import stemmer as stemmer_module
        except ImportError:
            # module not available
            return self.getFinalState(words)

        lang = self.getStemmerLanguage(options['lang'])
        if lang not in stemmer_module.availableStemmers():
            return self.getFinalState(words)

        return self.getFinalState(stemmer_module.Stemmer(lang).stem(words))
|---|
| 297 | |
|---|
# Register a Stemmer instance so it can be applied by its name
# ('stemmer') through applyFilter / applyFilters.
registerFilter(Stemmer())
|---|