import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# Load the dataset, separate the target from the predictors, and hold out
# part of the rows for validation.

# Load the data
data = pd.read_csv('XXX.csv')

# Select target
y = data['Target']

# To keep things simple, we'll use only numerical predictors.
# Dropping the target alone would leave text columns in X, so the numeric
# columns are selected explicitly to match that intent.
X = data.drop(columns=['Target']).select_dtypes(include='number')

# Divide data into training and validation subsets (80% / 20%).
# The fixed random_state makes the split reproducible from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
# Method 1: drop columns with missing values
# Discard every predictor column that has at least one missing value in the
# training data.

# Get names of columns with missing values
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Drop columns in training and validation data. The same training-derived
# list is applied to both frames so they keep identical columns.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)
# Imputation: fill each missing value with a statistic learned from the
# training data (SimpleImputer's default strategy is the column mean).
my_imputer = SimpleImputer()

# Fit on the training split only, then reuse the fitted imputer on the
# validation split so validation data does not influence the fill values.
# The original row index is kept so the frames stay aligned with y_train /
# y_valid.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train), index=X_train.index
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid), index=X_valid.index
)

# Imputation removed column names; put them back
# NOTE: by default SimpleImputer discards columns that are entirely missing
# in the training data; in that case the column counts would not match here.
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
# Method 3: imputation plus indicator columns marking which values were missing
# Impute missing values while also recording, in extra boolean columns,
# which entries were originally missing.
# Relies on cols_with_missing (names of the columns with missing training
# values) having been computed earlier in the script.

# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation: fit on the training split only and reuse the fitted imputer on
# the validation split. The original row index is kept for alignment with
# the targets.
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus), index=X_train_plus.index
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus), index=X_valid_plus.index
)

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns