Memory error using cv.fit_transform(corpus).toarray()
如果有人能帮我解决 `cv.fit_transform(corpus).toarray()` 在处理约 732066 条推文(每条少于 140 个字符)的语料库时出现的问题,我将不胜感激。文本已经过清洗,以减少特征数量和维度,但我仍然一直得到下面的错误。
我就是这样开始的
# Bag-of-words tweet classifier: load the tweets, clean and stem the text,
# vectorise it with CountVectorizer and evaluate a Gaussian Naive Bayes model.

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
cols = ["text", "geocoordinates0", "geocoordinates1", "grid"]
dataset = pd.read_csv('tweets.tsv', delimiter='\t', usecols=cols,
                      quoting=3, error_bad_lines=False, low_memory=False)


# Removing Non-ASCII characters
def remove_non_ascii_1(dataset):
    """Return the given text with every non-ASCII character replaced by a space."""
    return ''.join(ch if ord(ch) < 128 else ' ' for ch in dataset)


# Cleaning the texts
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the loop invariants once: the stop-word set, the stemmer and the
# compiled pattern do not change from one tweet to the next.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
non_letters = re.compile('[^a-zA-Z]')

# Iterate over the rows that were actually loaded instead of a hard-coded
# count, so `corpus` always has exactly one entry per row of `dataset`.
corpus = []
for text in dataset['text']:
    words = non_letters.sub(' ', str(text)).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in words
                           if word not in stop_words))

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
# NOTE: .toarray() materialises the full dense (documents x vocabulary)
# matrix; on the full tweet corpus this is the call that raises MemoryError.
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 3].values

# Splitting the dataset into the Training set and Test set
# (train_test_split is imported from sklearn.model_selection, the same module
# cross_val_score is imported from below.)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
# Print the summary statistics; as bare expressions they were discarded when
# the file ran as a script.
print(accuracies.mean())
print(accuracies.std())
下面是输出错误:
X = cv.fit_transform(corpus).toarray()
Traceback (most recent call last):
  File "", line 1, in <module>
    X = cv.fit_transform(corpus).toarray()
  File "C:\Anaconda3\envs\py35\lib\site-packages\scipy\sparse\compressed.py", line 920, in toarray
    return self.tocoo(copy=False).toarray(order=order, out=out)
  File "C:\Anaconda3\envs\py35\lib\site-packages\scipy\sparse\coo.py", line 252, in toarray
    B = self._process_toarray_args(order, out)
  File "C:\Anaconda3\envs\py35\lib\site-packages\scipy\sparse\base.py", line 1009, in _process_toarray_args
    return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError
多谢了!
附:按照 @kumar 的建议去掉 `.toarray()` 并改用 MultinomialNB 后,现在出现以下错误:
# Fit a multinomial Naive Bayes model on the training split.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(X=X_train, y=y_train)
Traceback (most recent call last):
  File "", line 1, in <module>
    classifier.fit(X_train, y_train)
  File "C:\Anaconda3\envs\py35\lib\site-packages\sklearn\naive_bayes.py", line 566, in fit
    Y = labelbin.fit_transform(y)
  File "C:\Anaconda3\envs\py35\lib\site-packages\sklearn\base.py", line 494, in fit_transform
    return self.fit(X, **fit_params).transform(X)
  File "C:\Anaconda3\envs\py35\lib\site-packages\sklearn\preprocessing\label.py", line 296, in fit
    self.y_type_ = type_of_target(y)
  File "C:\Anaconda3\envs\py35\lib\site-packages\sklearn\utils\multiclass.py", line 275, in type_of_target
    if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
  File "C:\Anaconda3\envs\py35\lib\site-packages\numpy\lib\arraysetops.py", line 198, in unique
    ar.sort()
TypeError: unorderable types: str() > float()
我的建议是:直接去掉 `.toarray()`,让 `X` 保持稀疏矩阵,并改用 MultinomialNB:
# ... other code (loading `dataset`, building `corpus`, creating `cv`) ...

# Keep the bag-of-words matrix sparse: calling .toarray() on it is what
# exhausts memory, and MultinomialNB accepts sparse input directly.
X = cv.fit_transform(corpus)
labels = dataset.iloc[:, 3]

# Drop rows whose label is missing. The reported
# "TypeError: unorderable types: str() > float()" is raised while sorting
# labels of mixed type; presumably the float entries are NaN from missing
# values -- verify against the data. Assumes `corpus` has one entry per row
# of `dataset`.
has_label = labels.notnull().values
X = X[has_label]
y = labels.values[has_label]

# Splitting the dataset into the Training set and Test set
# (sklearn.cross_validation was removed from scikit-learn; train_test_split
# lives in sklearn.model_selection.)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# ... other code ...