python实现支持unicode中文的AC自动机
最近开始从分析数据,要从大量短文本中匹配很多关键字,如果暴力find的话,发现CPU成为了瓶颈,于是想到了AC自动机
AC自动机是多模式匹配的一个经典数据结构,原理是和KMP一样的构造fail指针,不过AC自动机是在Trie树上构造的,但原理是一样的。
为了能够匹配unicode,我讲unicode编码之后,按照每4位进行索引,变成了16叉trie树。
其实这种事情应该用C/C++来写的,不过我不太会使用python调用c,所以就用python改写了
下面是代码
[python] #coding=utf-8
KIND = 16
#BASE = ord('a')
class Node():
def __init__(self):
self.fail = None
self.next = [None]*KIND
self.end = False
class AC_Automachine():
def __init__(self):
self.root = Node()
self.queue = []
def getIndex(self,char):
return ord(char)# - BASE
def insert(self,string):
p = self.root
for char in string:
index = self.getIndex(char)
if p.next[index] == None:
p.next[index] = Node()
p = p.next[index]
p.end = True
def build_automachine(self):
self.root.fail = None
self.queue.append(self.root)
while len(self.queue)!=0:
parent = self.queue[0]
self.queue.pop(0)
for i,child in enumerate(parent.next):
if child == None:continue
if parent == self.root:
child.fail = self.root
else:
failp = parent.fail
while failp != None:
if failp.next[i] != None:
child.fail = failp.next[i]
break
failp = failp.fail
if failp==None: child.fail=self.root
self.queue.append(child)
def matchOne(self,string):
p = self.root
for char in string:
index = self.getIndex(char)
while p.next[index]==None and p!=self.root: p=p.fail
if p.next[index]==None:p=self.root
else: p=p.next[index]
if p.end:return True
return False
class UnicodeAC_AutoMachine():
def __init__(self):
self.ac = AC_Automachine()
def getAcString(self,string):
string = bytearray(string.encode('utf-8'))
ac_string = ''
for byte in string:
ac_string += chr(byte%16)
ac_string += chr(byte/16)
#print ac_string
return ac_string
def insert(self,string):
if type(string) != unicode:
raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
ac_string = self.getAcString(string)
self.ac.insert(ac_string)
def build_automachine(self):
self.ac.build_automachine()
def matchOne(self,string):
if type(string) != unicode:
raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
ac_string = self.getAcString(string)
return self.ac.matchOne(ac_string)
def main2():
ac = UnicodeAC_AutoMachine()
ac.insert(u'丁亚光')
ac.insert(u'好吃的')
ac.insert(u'好玩的')
ac.build_automachine()
print ac.matchOne(u'hi,丁亚光在干啥')
print ac.matchOne(u'ab')
print ac.matchOne(u'不能吃饭啊')
print ac.matchOne(u'饭很好吃,有很多好好的吃的,')
print ac.matchOne(u'有很多好玩的')
if __name__ == '__main__':
main2()
#coding=utf-8
KIND = 16
#BASE = ord('a')
class Node():
def __init__(self):
self.fail = None
self.next = [None]*KIND
self.end = False
class AC_Automachine():
def __init__(self):
self.root = Node()
self.queue = []
def getIndex(self,char):
return ord(char)# - BASE
def insert(self,string):
p = self.root
for char in string:
index = self.getIndex(char)
if p.next[index] == None:
p.next[index] = Node()
p = p.next[index]
p.end = True
def build_automachine(self):
self.root.fail = None
self.queue.append(self.root)
while len(self.queue)!=0:
parent = self.queue[0]
self.queue.pop(0)
for i,child in enumerate(parent.next):
if child == None:continue
if parent == self.root:
child.fail = self.root
else:
failp = parent.fail
while failp != None:
if failp.next[i] != None:
child.fail = failp.next[i]
break
failp = failp.fail
if failp==None: child.fail=self.root
self.queue.append(child)
def matchOne(self,string):
p = self.root
for char in string:
index = self.getIndex(char)
while p.next[index]==None and p!=self.root: p=p.fail
if p.next[index]==None:p=self.root
else: p=p.next[index]
if p.end:return True
return False
class UnicodeAC_AutoMachine():
def __init__(self):
self.ac = AC_Automachine()
def getAcString(self,string):
string = bytearray(string.encode('utf-8'))
ac_string = ''
for byte in string:
ac_string += chr(byte%16)
ac_string += chr(byte/16)
#print ac_string
return ac_string
def insert(self,string):
if type(string) != unicode:
raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
ac_string = self.getAcString(string)
self.ac.insert(ac_string)
def build_automachine(self):
self.ac.build_automachine()
def matchOne(self,string):
if type(string) != unicode:
raise Exception('UnicodeAC_AutoMachine:: insert type not unicode')
ac_string = self.getAcString(string)
return self.ac.matchOne(ac_string)
def main2():
ac = UnicodeAC_AutoMachine()
ac.insert(u'丁亚光')
ac.insert(u'好吃的')
ac.insert(u'好玩的')
ac.build_automachine()
print ac.matchOne(u'hi,丁亚光在干啥')
print ac.matchOne(u'ab')
print ac.matchOne(u'不能吃饭啊')
print ac.matchOne(u'饭很好吃,有很多好好的吃的,')
print ac.matchOne(u'有很多好玩的')
if __name__ == '__main__':
main2()
下面是测试结果
关键词数是2000,长度2-12
文本长度为60-80,分别测试4000条文本和300000条文本的速度
速度分别为 5.5s 和 619.8s
如下图,挺不错的
摘自 New Day New Plan