UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 6: ordinal not in range(128)
Help me figure out what is wrong with my Python code.
Here is the code:
import nltk
import re
import pickle

raw = open('tom_sawyer_shrt.txt').read()

### this is how the basic Punkt sentence tokenizer works
#sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#sents = sent_tokenizer.tokenize(raw)

### train & tokenize using the text itself
sent_trainer = nltk.tokenize.punkt.PunktSentenceTokenizer().train(raw)
sent_tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(sent_trainer)

# break into sentences
sents = sent_tokenizer.tokenize(raw)
# get sentence start/stop indexes
sentspan = sent_tokenizer.span_tokenize(raw)

### Remove \n in the middle of sentences, due to fixed-width formatting
for i in range(0, len(sents)-1):
    sents[i] = re.sub('(?<!\n)\n(?!\n)', ' ', raw[sentspan[i][0]:sentspan[i+1][0]])

# if a sentence starts with the closing quote of the previous one, move it back
for i in range(1, len(sents)):
    if (sents[i][0:3] == '"\n\n'):
        sents[i-1] = sents[i-1] + '"\n\n'
        sents[i] = sents[i][3:]

### Loop through each sentence, fit to 140 chars
i = 0
tweet = []
while (i < len(sents)):
    if (len(sents[i]) > 140):
        # sentence too long: split it into roughly equal chunks of words
        ntwt = int(len(sents[i])/140) + 1
        words = sents[i].split(' ')
        nwords = len(words)
        for k in range(0, ntwt):
            tweet = tweet + [re.sub('\A\s|\s\Z', '', ' '.join(
                words[int(k*nwords/float(ntwt)):int((k+1)*nwords/float(ntwt))]))]
        i = i + 1
    else:
        if (i < len(sents)-1):
            if (len(sents[i]) + len(sents[i+1]) < 140):
                # short sentences: pack as many consecutive ones as will fit
                nextra = 1
                while (len(''.join(sents[i:i+nextra+1])) < 140):
                    nextra = nextra + 1
                tweet = tweet + [re.sub('\A\s|\s\Z', '', ''.join(sents[i:i+nextra]))]
                i = i + nextra
            else:
                tweet = tweet + [re.sub('\A\s|\s\Z', '', sents[i])]
                i = i + 1
        else:
            tweet = tweet + [re.sub('\A\s|\s\Z', '', sents[i])]
            i = i + 1

### A last pass to clean up leading/trailing newlines/spaces.
for i in range(0, len(tweet)):
    tweet[i] = re.sub('\A\s|\s\Z', '', tweet[i])
for i in range(0, len(tweet)):
    tweet[i] = re.sub('\A"\n\n', '', tweet[i])

### Save tweets to pickle file for easy reading later
output = open('tweet_list.pkl', 'wb')
pickle.dump(tweet, output, -1)
output.close()

listout = open('tweet_lis.txt', 'w')
for i in range(0, len(tweet)):
    listout.write(tweet[i])
    listout.write('\n-----------------\n')
listout.close()
Here is the error message:
Traceback (most recent call last):
File"twain_prep.py", line 13, in
sent_trainer = nltk.tokenize.punkt.PunktSentenceTokenizer().train(raw)
File"/home/user/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 1227, in train
token_cls=self._Token).get_params()
File"/home/user/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 649, in init
self.train(train_text, verbose, finalize=True)
File"/home/user/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 713, in train
self._train_tokens(self._tokenize_words(text), verbose)
File"/home/user/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 729, in _train_tokens
tokens = list(tokens)
File"/home/user/.local/lib/python2.7/site-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
for line in plaintext.split('
'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 6: ordinal not in range(128)
It happens whenever the string contains some Unicode (non-ASCII) characters.
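A minimal sketch of what seems to be going on (assuming Python 2, as in the traceback): open().read() returns a byte str holding raw UTF-8 bytes, and punkt.py appears to mix that byte string with unicode string literals, which makes Python attempt an implicit decode with the default ASCII codec:

# A byte str containing UTF-8 bytes, standing in for open('tom_sawyer_shrt.txt').read()
raw = 'caf\xc3\xa9\nsecond line'

# Mixing it with a unicode operand (as punkt.py effectively does) forces an
# implicit ASCII decode of the bytes, which fails on any byte >= 0x80:
raw.split(u'\n')   # UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 ...

So how can I fix it?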
You can convert the text to plain ASCII and simply drop the characters that cannot be represented. Assuming the file is UTF-8, decode it first and then encode:

raw = raw.decode('utf-8').encode('ascii', 'ignore')
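For context (a sketch added here, not part of the original answer): in Python 2 the decode has to come first, because calling .encode('ascii', 'ignore') directly on the byte string triggers the same implicit ASCII decode that caused the error. Byte 0xe2 is typically the first byte of UTF-8 punctuation such as curly quotes or dashes, which this approach simply throws away:

s = 'Tom\xe2\x80\x99s raft'                         # UTF-8 bytes for a curly apostrophe
print(s.decode('utf-8').encode('ascii', 'ignore'))  # -> Toms raft
# s.encode('ascii', 'ignore')                       # would raise UnicodeDecodeError again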
Also, you can read this article on how to handle Unicode in Python.
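An alternative worth noting (a sketch, not from the original answer): instead of throwing the characters away, read the file as Unicode up front. The Punkt tokenizer handles unicode strings, so the rest of the script can stay as it is; 'utf-8' is an assumption about how the file is encoded.

import codecs
import nltk

# Decode while reading, so `raw` is a unicode string instead of raw bytes.
raw = codecs.open('tom_sawyer_shrt.txt', encoding='utf-8').read()

sent_trainer = nltk.tokenize.punkt.PunktSentenceTokenizer().train(raw)

# Note: the plain-text output file at the end would then also need to be
# opened with codecs.open (or the tweets encoded) before writing.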