| 1 | # This program is free software; you can redistribute it and/or modify it under |
|---|
| 2 | # the terms of the GNU General Public License as published by the Free Software |
|---|
| 3 | # Foundation; either version 2 of the License, or (at your option) any later |
|---|
| 4 | # version. |
|---|
| 5 | # |
|---|
| 6 | # This program is distributed in the hope that it will be useful, but WITHOUT |
|---|
| 7 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
|---|
| 8 | # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
|---|
| 9 | # |
|---|
| 10 | # You should have received a copy of the GNU General Public License along with |
|---|
| 11 | # this program; if not, write to the Free Software Foundation, Inc., |
|---|
| 12 | # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
|---|
| 13 | """ Copyright (c) 2002-2003 LOGILAB S.A. (Paris, FRANCE). |
|---|
| 14 | http://www.logilab.fr/ -- mailto:contact@logilab.fr |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | a Python implementation of PATRICIA tree |
|---|
| 18 | |
|---|
| 19 | PATRICIA - Practical Algorithm to Retrieve Information Coded in Alphanumeric |
|---|
| 20 | D.R.Morrison (1968). |
|---|
| 21 | See http://www.csse.monash.edu.au/~lloyd/tildeAlgDS/Tree/PATRICIA.html if you |
|---|
| 22 | want to know what's a PATRICIA tree... |
|---|
| 23 | |
|---|
| 24 | TODO: _ advanced search |
|---|
| 25 | _ profile code |
|---|
| 26 | _ use mxTextTools ? |
|---|
| 27 | """ |
|---|
| 28 | |
|---|
| 29 | from warnings import warn |
|---|
| 30 | warn('this module is deprecated and will disappear in a near release', |
|---|
| 31 | DeprecationWarning, stacklevel=1) |
|---|
| 32 | |
|---|
| 33 | __revision__ = "$Id: patricia.py,v 1.5 2003-10-31 14:18:32 syt Exp $" |
|---|
| 34 | |
|---|
| 35 | def prefix(prfx, string): |
|---|
| 36 | """return the index of the first character from string which differs from |
|---|
| 37 | prefix |
|---|
| 38 | """ |
|---|
| 39 | i = 0 |
|---|
| 40 | while i < len(prfx): |
|---|
| 41 | if i == len(string) or prfx[i] != string[i]: |
|---|
| 42 | break |
|---|
| 43 | i += 1 |
|---|
| 44 | return i |
|---|
| 45 | |
|---|
| 46 | def split(index, string): |
|---|
| 47 | """split a string on index, returning a 3-uple : |
|---|
| 48 | (string before index, character at index, string after index) |
|---|
| 49 | """ |
|---|
| 50 | return string[:index], string[index], string[index+1:] |
|---|
| 51 | |
|---|
| 52 | |
|---|
| 53 | class PatriciaNode: |
|---|
| 54 | """a PATRICIA trie node |
|---|
| 55 | """ |
|---|
| 56 | |
|---|
| 57 | def __init__(self, value='', leaf=0, data=None): |
|---|
| 58 | self.value = value |
|---|
| 59 | self.edges = {} |
|---|
| 60 | if leaf: |
|---|
| 61 | self.datas = [data] |
|---|
| 62 | else: |
|---|
| 63 | self.datas = [] |
|---|
| 64 | |
|---|
| 65 | def insert(self, string, data): |
|---|
| 66 | """ insert the string in the trie and associate data to it |
|---|
| 67 | if the string exists is the trie, data is added to the existing datas |
|---|
| 68 | """ |
|---|
| 69 | # are we arrived ? |
|---|
| 70 | if self.value == string: |
|---|
| 71 | self.datas.append(data) |
|---|
| 72 | # not yet ! |
|---|
| 73 | else: |
|---|
| 74 | # check we don't break compression (value don't match) |
|---|
| 75 | ind = prefix(self.value, string) |
|---|
| 76 | if ind < len(self.value): |
|---|
| 77 | # split this node |
|---|
| 78 | pfx, e, self.value = split(ind, self.value) |
|---|
| 79 | if ind < len(string): |
|---|
| 80 | n = PatriciaNode(pfx) |
|---|
| 81 | n.edges[string[ind]] = PatriciaNode(string[ind+1:], 1, data) |
|---|
| 82 | else: |
|---|
| 83 | n = PatriciaNode(pfx, 1, data) |
|---|
| 84 | n.edges[e] = self |
|---|
| 85 | return n |
|---|
| 86 | n_pfx, n_e, n_sfx = split(len(self.value), string) |
|---|
| 87 | if self.edges.has_key(n_e): |
|---|
| 88 | self.edges[n_e] = self.edges[n_e].insert(n_sfx, data) |
|---|
| 89 | else: |
|---|
| 90 | self.edges[n_e] = PatriciaNode(n_sfx, 1, data) |
|---|
| 91 | return self |
|---|
| 92 | |
|---|
| 93 | def remove(self, string): |
|---|
| 94 | """ return datas associated with string and remove string from the trie |
|---|
| 95 | raise KeyError if the key isn't found |
|---|
| 96 | FIXME: we should change the trie structure |
|---|
| 97 | """ |
|---|
| 98 | if string == self.value and self.datas: |
|---|
| 99 | datas = self.datas |
|---|
| 100 | self.datas = [] |
|---|
| 101 | return datas |
|---|
| 102 | else: |
|---|
| 103 | pfx, e, sfx = split(len(self.value), string) |
|---|
| 104 | if self.value == pfx: |
|---|
| 105 | return self.edges[e].remove(sfx) |
|---|
| 106 | raise KeyError(string) |
|---|
| 107 | |
|---|
| 108 | def lookup(self, string): |
|---|
| 109 | """ return datas associated with string |
|---|
| 110 | raise KeyError if the key isn't found |
|---|
| 111 | """ |
|---|
| 112 | if string == self.value: |
|---|
| 113 | if self.datas: |
|---|
| 114 | return self.datas |
|---|
| 115 | raise KeyError(string) |
|---|
| 116 | else: # len(self.value) < len(string): |
|---|
| 117 | pfx, e, sfx = split(len(self.value), string) |
|---|
| 118 | if self.value == pfx: |
|---|
| 119 | return self.edges[e].lookup(sfx) |
|---|
| 120 | raise KeyError(string) |
|---|
| 121 | |
|---|
| 122 | def pfx_search(self, pfx, depth=-1): |
|---|
| 123 | """ return all string with prefix pfx """ |
|---|
| 124 | sfxs = [] |
|---|
| 125 | if pfx and self.value[:len(pfx)] != pfx: |
|---|
| 126 | pfx, e, sfx = split(len(self.value), pfx) |
|---|
| 127 | if self.value == pfx and self.edges.has_key(e): |
|---|
| 128 | sfxs = ['%s%s%s' % (self.value, e, sfx) |
|---|
| 129 | for sfx in self.edges[e].pfx_search(sfx, depth)] |
|---|
| 130 | else: |
|---|
| 131 | if depth != 0: |
|---|
| 132 | for e, child in self.edges.items(): |
|---|
| 133 | search = child.pfx_search('', depth-1-len(self.value)) |
|---|
| 134 | sfxs += ['%s%s%s' % (self.value, e, sfx) |
|---|
| 135 | for sfx in search] |
|---|
| 136 | if (depth < 0 or len(self.value) <= depth): |
|---|
| 137 | if self.datas: |
|---|
| 138 | sfxs.append(self.value) |
|---|
| 139 | return sfxs |
|---|
| 140 | |
|---|
| 141 | def __str__(self, indent=''): |
|---|
| 142 | node_str = ''.join([' %s%s:\n%s' % (indent, key, |
|---|
| 143 | a.__str__(' %s' % indent)) |
|---|
| 144 | for key, a in self.edges.items()]) |
|---|
| 145 | return '%s%s, %s\n%s' % (indent, self.value, self.datas, node_str) |
|---|
| 146 | |
|---|
| 147 | def __repr__(self): |
|---|
| 148 | return '<PatriciaNode id=%s value=%s childs=%s datas=%s>' % ( |
|---|
| 149 | id(self), self.value, self.edges.keys(), self.datas) |
|---|
| 150 | |
|---|
| 151 | |
|---|
| 152 | class PatriciaTrie: |
|---|
| 153 | """ wrapper class for a patricia tree |
|---|
| 154 | delegates to the root of the tree (PatriciaNode) |
|---|
| 155 | """ |
|---|
| 156 | |
|---|
| 157 | def __init__(self): |
|---|
| 158 | self._trie = None |
|---|
| 159 | self.words = 0 |
|---|
| 160 | |
|---|
| 161 | def insert(self, string, data=None): |
|---|
| 162 | """ insert a string into the tree """ |
|---|
| 163 | self.words += 1 |
|---|
| 164 | if self._trie is None: |
|---|
| 165 | self._trie = PatriciaNode(string, 1, data) |
|---|
| 166 | else: |
|---|
| 167 | self._trie = self._trie.insert(string, data) |
|---|
| 168 | |
|---|
| 169 | def remove(self, string): |
|---|
| 170 | """ remove a string from the tree """ |
|---|
| 171 | if self._trie is not None: |
|---|
| 172 | return self._trie.remove(string) |
|---|
| 173 | raise KeyError(string) |
|---|
| 174 | |
|---|
| 175 | def lookup(self, string): |
|---|
| 176 | """ look for a string into the tree """ |
|---|
| 177 | if self._trie is not None: |
|---|
| 178 | return self._trie.lookup(string) |
|---|
| 179 | raise KeyError(string) |
|---|
| 180 | |
|---|
| 181 | def pfx_search(self, string, depth=-1): |
|---|
| 182 | """ search all words begining by <string> """ |
|---|
| 183 | if self._trie is not None: |
|---|
| 184 | return self._trie.pfx_search(string, depth) |
|---|
| 185 | raise KeyError(string) |
|---|
| 186 | |
|---|
| 187 | def __str__(self): |
|---|
| 188 | return self._trie.__str__() |
|---|
| 189 | |
|---|
| 190 | def __repr__(self): |
|---|
| 191 | return '<PatriciaTrie id=%s words=%s>' % (id(self), self.words) |
|---|