文本预处理
过滤符号,去掉上标,转换为小写,非英文字符用空格隔开,连续重复字母数大于等于3的只保留1个,去掉指定单词中的空格。
import regex as re  # third-party "regex" package: needed for \p{...} Unicode classes
import unicodedata

# Runs of punctuation, separators, symbols or numbers. They are replaced by a
# space *before* normalisation so that they act as token boundaries.
_BOUNDARY_RE = re.compile(r"\p{P}+|\p{Z}+|\p{S}+|\p{N}+")
# Combining marks left behind by NFKD decomposition (accents and the like).
_MARK_RE = re.compile(r"\p{M}+")
# Punctuation/symbols/numbers that only appear *after* NFKD (for example a
# superscript digit decomposes to a plain digit), plus surrogate, format and
# private-use code points. These are deleted, not replaced by a space.
_RESIDUE_RE = re.compile(r"\p{P}+|\p{S}+|\p{N}+|\p{Cs}+|\p{Cf}+|\p{Co}+")
_ASCII_LETTERS_RE = re.compile(r"[A-Za-z]+")
_NON_ASCII_RE = re.compile(r"[^\x00-\x7f]")
# A word character followed by at least two more copies of itself.
_REPEAT_RE = re.compile(r"(\w)\1{2,}")
_WHITESPACE_RE = re.compile(r"\s+")


def _spaced_word_pattern(word):
    """Return a regex source matching *word* with stretched or spaced letters.

    Every character may be repeated one or more times and consecutive
    characters may be separated by optional whitespace, e.g. ``"abc"`` gives
    ``a+\\s*b+\\s*c+``. Characters are escaped so that regex metacharacters in
    *word* are matched literally.
    """
    return r"\s*".join(re.escape(ch) + "+" for ch in word)


def process(text, fwords=None):
    """Normalise a piece of text for downstream processing.

    Steps, in order:

    1. Replace runs of punctuation, separators, symbols and numbers by a space.
    2. Apply NFKD normalisation and drop combining marks.
    3. Delete punctuation, symbols, numbers and surrogate/format/private-use
       code points that are present after normalisation.
    4. Lower-case ASCII letters (non-ASCII letters are left as they are).
    5. Surround every non-ASCII character with spaces.
    6. Collapse a word character repeated three or more times to a single one.
    7. Collapse whitespace runs to a single space.
    8. For every word in *fwords*, replace any stretched/spaced spelling of it
       (see ``_spaced_word_pattern``) by the word itself surrounded by spaces,
       then collapse whitespace again.

    The result is not stripped, so it may start or end with a single space.

    Args:
        text: The string to clean. Any non-string value (for example a NaN
            cell in a pandas column) is returned unchanged.
        fwords: Optional iterable of words to re-join. When omitted, the
            module-level ``fword_list`` is used if it exists; otherwise no
            words are re-joined.

    Returns:
        The cleaned string, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        # The previous catch-all ``except`` handed non-string values back
        # untouched; keep that best-effort behaviour, but explicitly.
        return text
    if fwords is None:
        # ``fword_list`` is an optional module-level global; look it up
        # explicitly instead of relying on a swallowed NameError.
        fwords = globals().get("fword_list", ())

    text = _BOUNDARY_RE.sub(" ", text)
    text = unicodedata.normalize("NFKD", text)
    text = _MARK_RE.sub("", text)
    text = _RESIDUE_RE.sub("", text)
    text = _ASCII_LETTERS_RE.sub(lambda m: m.group().lower(), text)
    text = _NON_ASCII_RE.sub(lambda m: " " + m.group() + " ", text)
    text = _REPEAT_RE.sub(r"\1", text)
    text = _WHITESPACE_RE.sub(" ", text)

    for word in fwords:
        if not word:
            # An empty pattern matches at every position and would pad the
            # whole text with spaces.
            continue
        replacement = " " + word + " "
        # A callable replacement inserts the word literally; a plain string
        # would be parsed as a template (backslashes, group references).
        text = re.sub(
            _spaced_word_pattern(word),
            lambda _m, repl=replacement: repl,
            text,
        )

    return _WHITESPACE_RE.sub(" ", text)


# NOTE(review): ``df`` is not defined in this snippet; it is presumably a
# pandas DataFrame with a "text" column created earlier - confirm.
df['text'] = df['text'].apply(process)