Regressor test with Skflow and Sklearn
Some regressor tests with skflow and sklearn
regressor.py
# -*- coding: utf-8 -*-
import pandas
import numpy as np
import matplotlib.pyplot as plt
from pylab import savefig
from sklearn import datasets, cross_validation, metrics
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
import skflow
########################################################
# Set data
# Load the Boston dataset and pull out the feature matrix and the target.
boston = datasets.load_boston()
X = boston.data
y = boston.target
# Split dataset into train / test
# 10% of the rows are held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, random_state=42, test_size=0.1)
# Learn the standardization from the training rows only, then apply it to them.
# X_test is not scaled here; it has to go through scaler.transform before use.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
########################################################
# Fit regression model
# Exactly one `regressor = ...` assignment is active at a time. The
# alternatives below are disabled by wrapping them in bare triple-quoted
# strings, which are evaluated and discarded; to switch models, remove the
# quotes around the wanted option and disable the others.
# 1. SVR
# Active choice: support vector regression with an RBF kernel.
regressor = SVR(kernel='rbf', C=1000, gamma='auto')
# 2. Gradient Boosting
# NOTE(review): min_samples_split=1 and loss='ls' may be rejected by newer
# scikit-learn releases -- verify against the installed version before enabling.
"""
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
'learning_rate': 0.01, 'loss': 'ls'}
regressor = ensemble.GradientBoostingRegressor(**params)
"""
# 3. Random Forest
# NOTE(review): same min_samples_split=1 caveat as above -- verify.
"""
regressor = RandomForestRegressor(n_estimators=200, min_samples_split=1)
"""
# 4. DNN
# skflow network with three hidden layers of 10 units each.
"""
regressor = skflow.TensorFlowDNNRegressor(hidden_units=[10, 10, 10],
steps=20000, learning_rate=0.01, batch_size=13)
"""
########################################################
# Train and Predict
# Fit the selected model on the standardized training features.
regressor.fit(X_train, y_train)
# The scaler was fitted on the training split only; reuse it (transform, not
# fit_transform) so the held-out features get the same scaling.
y_test_pred = regressor.predict(scaler.transform(X_test))
# mean_squared_error expects (y_true, y_pred); keep that order so the call
# stays correct if the metric is later replaced by an asymmetric one.
score = metrics.mean_squared_error(y_test, y_test_pred)
# X_ty holds the predictions on the training set (also used by the plot code).
X_ty = regressor.predict(X_train)
score1 = metrics.mean_squared_error(y_train, X_ty)
print('Test MSE: {0:f}'.format(score))
print('Train MSE: {0:f}'.format(score1))
########################################################
# Look at the results
# Disabled plotting code, kept as bare string literals (no effect at runtime).
# When enabled, the first snippet scatters the training-set predictions (X_ty)
# against the measured targets (y_train) and draws a dashed black reference
# line from (min, min) to (max, max) of y_train, i.e. the "perfect prediction"
# diagonal.
"""
fig,ax = plt.subplots()
ax.scatter(y_train, X_ty)
ax.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
"""
# Disabled: would write the current figure to result.png.
# NOTE(review): if both snippets are enabled, savefig runs after plt.show();
# depending on the backend the figure may already be closed by then -- confirm,
# or call savefig before show.
"""
savefig('result.png')
"""
Data preprocessing
pw.csv is exported from the data set; its header row and first record are:
Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,
Sub_metering_2,Sub_metering_3
16/12/2006,17:24:00,4.216,0.418,234.840,18.400,0.000,1.000,17.000
…
# Load pw.csv and build the feature matrix X and the target y.
data = pandas.read_csv('pw.csv')
# Take an explicit copy of the feature columns so the column assignments
# below write to an independent frame rather than to a selection of `data`
# (this avoids pandas' SettingWithCopyWarning).
X = data[['Date', 'Time', 'Global_reactive_power', 'Voltage',
          'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
          'Sub_metering_3']].copy()
# Target column to be predicted from the remaining measurements.
y = data['Global_active_power']
# Dates are day/month/year (e.g. 16/12/2006); keep only the month as a
# numeric feature. The .dt accessor is vectorized, unlike a per-row apply.
D = pandas.to_datetime(X["Date"], format="%d/%m/%Y")
X["Month"] = D.dt.month
# Times are HH:MM:SS (e.g. 17:24:00); keep only the hour of day.
T = pandas.to_datetime(X["Time"], format="%H:%M:%S")
X["Hour"] = T.dt.hour
# The raw text columns are no longer needed once Month/Hour are derived.
X = X.drop(["Date", "Time"], axis=1)