Got error: Input contains NaN, infinity or a value too large for dtype('float64')
## Load the data ##
train = pd.read_csv("../kagglehouse/train.csv")
test = pd.read_csv("../kagglehouse/test.csv")
all_data = pd.concat((train.loc[:, "MSSubClass":"SaleCondition"],
                      test.loc[:, "MSSubClass":"SaleCondition"]))

# The estimators below reject strings, NaN and inf.  One-hot encode the
# non-numeric columns and fill the remaining gaps with the column mean so
# the matrices handed to sklearn are purely numeric and finite.
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

NFOLDS = 5
SEED = 0
NROWS = None

ntrain = train.shape[0]
ntest = test.shape[0]

# creating matrices for sklearn
# y_train is converted to a plain NumPy array on purpose: indexing a pandas
# Series with the integer arrays produced by KFold is a *label* lookup, which
# goes wrong as soon as the index of `train` is not exactly 0..n-1 (for
# example after rows were dropped).  An ndarray is always indexed by position.
y_train = train["SalePrice"].values
x_train = np.array(all_data[:ntrain], dtype=float)
x_test = np.array(all_data[ntrain:], dtype=float)

# NOTE(review): this is the legacy sklearn.cross_validation.KFold signature
# (iterable object, `n_folds`).  With sklearn.model_selection.KFold it would
# be KFold(n_splits=NFOLDS, ...) and `kf.split(x_train)` -- confirm against
# the file's imports before changing it.
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)


class SklearnWrapper(object):
    """Give every estimator the same train()/predict() interface.

    clf    -- estimator class (not an instance); it is called as clf(**params)
    seed   -- value stored under the 'random_state' key of the parameters
    params -- optional dict of constructor keyword arguments
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: params may be None, and the caller's dict must not
        # silently gain a 'random_state' entry.
        params = dict(params or {})
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the data passed in by the caller."""
        # Fit on the arguments (the current fold), not on module-level data;
        # otherwise every fold would be trained on the same rows.
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for x."""
        return self.clf.predict(x)


def get_oof(clf):
    """Compute out-of-fold predictions for one wrapped model.

    For every fold in `kf` the model is trained on the training part of the
    fold and used to predict both the held-out part of x_train and the whole
    of x_test.  Returns a pair of column vectors:
    (out-of-fold predictions for x_train, fold-averaged predictions for x_test).
    """
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    # Average the per-fold test predictions into a single vector.
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


et_params = {
    'n_jobs': 16,
}

rf_params = {
    'n_jobs': 16,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
}

rd_params = {
    'alpha': 10
}

ls_params = {
    'alpha': 0.005
}

et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)

et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)
报错信息看起来是这样的：
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | ValueError Traceback (most recent call last) in () 135 136 xg_oof_train, xg_oof_test = get_oof(xg) --> 137 et_oof_train, et_oof_test = get_oof(et) 138 rf_oof_train, rf_oof_test = get_oof(rf) 139 rd_oof_train, rd_oof_test = get_oof(rd) in get_oof(clf) 77 x_te = x_train[test_index] 78 ---> 79 clf.train(x_tr, y_tr) 80 81 oof_train[test_index] = clf.predict(x_te) in train(self, x_train, y_train) 46 def train(self, x_train, y_train): 47 #self.clf.fit(x_train, y_train) ---> 48 self.clf.fit(x_train, y_train) 49 50 def predict(self, x): E:\graphLab\Anaconda2\lib\site-packages\sklearn\ensemble\forest.pyc in fit(self, X, y, sample_weight) 245 # Validate or convert input data 246 X = check_array(X, accept_sparse="csc", dtype=DTYPE) --> 247 y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) 248 if sample_weight is not None: 249 sample_weight = check_array(sample_weight, ensure_2d=False) E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 420 % (array.ndim, estimator_name)) 421 if force_all_finite: --> 422 _assert_all_finite(array) 423 424 shape_repr = _shape_repr(array.shape) E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in _assert_all_finite(X) 41 and not np.isfinite(X).all()): 42 raise ValueError("Input contains NaN, infinity" ---> 43 " or a value too large for %r." % X.dtype) 44 45 ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). |
当我使用
您没有正确检查数据中是否存在 NaN 或无穷值。
1 2 | np.isnan( all_data.all() ) np.isfinite( all_data.all() ) |
这不是检查数据的正确方式。
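您将 `all_data.all()` 的结果传给了 `np.isnan` 和 `np.isfinite`：`.all()` 会先把数据归约成布尔值，而布尔值既不是 NaN 也总是有限的，所以这样检查不出任何问题。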
您应该这样检查数据：
# Apply the element-wise test first, then reduce the boolean result.
np.isfinite(all_data).all()  # True only when every value is finite
np.isnan(all_data).any()     # True as soon as at least one value is NaN
注意,