关于python:自定义Sklearn Transformer单独工作,在管道中使用时引发错误

Custom Sklearn Transformer works alone, Throws Error When Used in Pipeline

我有一个简单的sklearn课程,我想作为sklearn管道的一部分使用。这个类只取一个pandas数据框X_DF和一个分类列名,调用pd.get_dummies返回数据框,该列变成一个虚拟变量的矩阵…

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator

class dummy_var_encoder(TransformerMixin, BaseEstimator):
    '''Convert selected categorical column to (set of) dummy variables    
    '''



    def __init__(self, column_to_dummy='default_col_name'):
        self.column = column_to_dummy
        print self.column

    def fit(self, X_DF, y=None):
        return self

    def transform(self, X_DF):
        ''' Update X_DF to have set of dummy-variables instead of orig column'''        

        # convert self-attribute to local var for ease of stepping through function
        column = self.column

        # add columns for new dummy vars, and drop original categorical column
        dummy_matrix = pd.get_dummies(X_DF[column], prefix=column)

        new_DF = pd.concat([X_DF[column], dummy_matrix], axis=1)

        return new_DF

现在,使用这个变压器来适应/转换,我得到了预期的输出。一些玩具数据如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from sklearn import datasets
# Load toy data
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns = iris.feature_names)
y = pd.Series(iris.target, name='y')

# Create Arbitrary categorical features
X['category_1'] = pd.cut(X['sepal length (cm)'],
                         bins=3,
                         labels=['small', 'medium', 'large'])

X['category_2'] = pd.cut(X['sepal width (cm)'],
                         bins=3,
                         labels=['small', 'medium', 'large'])

…我的虚拟编码器产生正确的输出:

1
2
3
4
5
6
7
8
9
10
11
12
encoder = dummy_var_encoder(column_to_dummy = 'category_1')
encoder.fit(X)
encoder.transform(X).iloc[15:21,:]

category_1
   category_1  category_1_small  category_1_medium  category_1_large
15     medium                 0                  1                 0
16      small                 1                  0                 0
17      small                 1                  0                 0
18     medium                 0                  1                 0
19      small                 1                  0                 0
20      small                 1                  0                 0

但是,当我从如下定义的sklearn管道调用相同的变压器时:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV

# Define Pipeline
clf = LogisticRegression(penalty='l1')
pipeline_steps = [('dummy_vars', dummy_var_encoder()),
                  ('clf', clf)
                  ]

pipeline = Pipeline(pipeline_steps)

# Define hyperparams try for dummy-encoder and classifier
# Fit 4 models - try dummying category_1 vs category_2, and using l1 vs l2 penalty in log-reg
param_grid = {'dummy_vars__column_to_dummy': ['category_1', 'category_2'],
              'clf__penalty': ['l1', 'l2']
                  }

# Define full model search process
cv_model_search = GridSearchCV(pipeline,
                               param_grid,
                               scoring='accuracy',
                               cv = KFold(),
                               refit=True,
                               verbose = 3)

在我安装管道之前一切都很好,这时我从虚拟编码器中得到一个错误:

1
cv_model_search.fit(X,y=y)

In [101]: cv_model_search.fit(X,y=y) Fitting 3 folds for each of 4
candidates, totalling 12 fits

None None None None
[CV] dummy_vars__column_to_dummy=category_1, clf__penalty=l1 .........

Traceback (most recent call last):

File"", line 1, in
cv_model_search.fit(X,y=y)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/model_selection/_search.py",
line 638, in fit
cv.split(X, y, groups)))

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 779, in call
while self.dispatch_one_batch(iterator):

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 625, in dispatch_one_batch
self._dispatch(tasks)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py",
line 111, in apply_async
result = ImmediateResult(func)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py",
line 332, in init
self.results = batch()

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py",
line 131, in call
return [func(*args, **kwargs) for func, args, kwargs in self.items]

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/model_selection/_validation.py",
line 437, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/pipeline.py",
line 257, in fit
Xt, fit_params = self._fit(X, y, **fit_params)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/pipeline.py",
line 222, in _fit
**fit_params_steps[name])

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/externals/joblib/memory.py",
line 362, in call
return self.func(*args, **kwargs)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/pipeline.py",
line 589, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/sklearn/base.py",
line 521, in fit_transform
return self.fit(X, y, **fit_params).transform(X)

File"", line 21, in transform
dummy_matrix = pd.get_dummies(X_DF[column], prefix=column)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/frame.py",
line 1964, in getitem
return self._getitem_column(key)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/frame.py",
line 1971, in _getitem_column
return self._get_item_cache(key)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/generic.py",
line 1645, in _get_item_cache
values = self._data.get(item)

File
"/home/max/anaconda3/envs/remine/lib/python2.7/site-packages/pandas/core/internals.py",
line 3599, in get
raise ValueError("cannot label index with a null key")

ValueError: cannot label index with a null key


痕迹告诉你到底出了什么问题。学习诊断跟踪是非常宝贵的,尤其是当您从您可能不完全理解的库继承时。

现在,我已经在sklearn中做了一些继承工作,我可以毫无疑问地告诉您,如果输入到fitfit_transform方法的数据类型不是numpy数组,GridSearchCV将给您带来一些麻烦。正如Vivek在他的评论中提到的,传递给fit方法的x不再是一个数据帧。但让我们先看看这个痕迹。

ValueError: cannot label index with a null key

虽然Vivek对numpy数组是正确的,但这里还有另一个问题。您得到的实际错误是,fit方法中的column的值为无。如果您查看上面的encoder对象,您将看到__repr__方法输出以下内容:

1
dummy_var_encoder(column_to_dummy=None)

当使用Pipeline时,这个参数被初始化并传递给GridSearchCV。这是一种可以在整个交叉验证和搜索方法中看到的行为,并且具有与输入参数同名的属性会导致类似的问题。解决这个问题会让你走上正确的道路。

修改__init__方法将解决这个具体问题:

1
2
3
def __init__(self, column='default_col_name'):
    self.column = column
    print(self.column)

然而,一旦你做到了这一点,维韦克提到的问题将成为它的后盾,你将不得不处理。这是我以前遇到过的事情,尽管没有专门针对数据帧。我提出了一个在自定义类上使用sklearn GridSearchCV的解决方案,该类的fit方法采用3个参数。基本上,我创建了一个包装器来实现__getitem__方法,使数据的外观和行为能够通过GridSearchCVPipeline和其他交叉验证方法中使用的验证方法。

编辑

我做了这些更改,看起来您的问题来自于验证方法check_array。虽然用dtype=pd.DataFrame调用这个方法是可行的,但是线性模型使用dtype=np.float64抛出一个错误来调用这个方法。要绕过这个问题,而不是将原始数据与您的虚拟对象连接起来,您只需返回虚拟列并使用这些列进行拟合。这是无论如何都应该做的事情,因为您不想在您试图适应的模型中同时包含伪列和原始数据。你也可以考虑选择drop_first,但我不想谈这个问题。因此,像这样更改fit方法可以使整个过程按预期工作。

1
2
3
4
5
6
7
8
9
10
def transform(self, X_DF):
    ''' Update X_DF to have set of dummy-variables instead of orig column'''        

    # convert self-attribute to local var for ease of stepping through function
    column = self.column

    # add columns for new dummy vars, and drop original categorical column
    dummy_matrix = pd.get_dummies(X_DF[column], prefix=column)

    return dummy_matrix