copro-api/api/data/prediction.py
2020-07-29 23:41:16 +02:00

83 lines
2.7 KiB
Python

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# TODO: use one hot encoding to handle unknown values in transform step
from sklearn.preprocessing import LabelEncoder
from .statistics import data_path
class LinearRegressor:
    """Linear regression of condominium expenses on listing features.

    The training set is built once, at construction time, from the CSV file
    located at ``data_path``. The model itself is fitted lazily on the first
    call to :meth:`get_trained_model` and reused afterwards.

    Attributes:
        X: preprocessed (imputed and label-encoded) training features.
        y: training target, i.e. the ``CONDOMINIUM_EXPENSES`` column.
        encoders: dict mapping each categorical column name to the
            ``LabelEncoder`` fitted on that column of the training set.
    """

    def __init__(self):
        self.X, self.y = self.preprocessing()
        # Fitted on demand by get_trained_model().
        self._model = None

    def preprocessing(self):
        """Prepare data for modelling.

        Reads the CSV at ``data_path``, drops the rows with a missing value
        in one of the required columns, keeps the 70% training part of a
        seeded split, imputes missing values and label-encodes the
        categorical (``object`` dtype) columns.

        As a side effect, ``self.encoders`` is replaced with the encoders
        fitted during this call (one per categorical column).

        Returns:
            A ``(X_train, y_train)`` tuple: the preprocessed training
            features and the matching ``CONDOMINIUM_EXPENSES`` values.
        """
        raw_data = pd.read_csv(
            data_path,
            dtype={'DEPT_CODE': str, 'ZIP_CODE': str, 'INSEE_CODE': str})

        # Rows lacking the target or one of these features are dropped.
        required_cols = ['CONDOMINIUM_EXPENSES',
                         'DEPT_CODE', 'ZIP_CODE', 'CITY', 'SURFACE']
        data = raw_data.dropna(subset=required_cols)

        # Columns used in modelling.
        feature_cols = ['ZIP_CODE', 'DEPT_CODE', 'CITY', 'INSEE_CODE',
                        'LATITUDE', 'LONGITUDE', 'SURFACE', 'HEATING_MODE',
                        'ELEVATOR']
        y = data['CONDOMINIUM_EXPENSES']
        # SURFACE is not parsed as a number, so cast it explicitly. astype()
        # returns a new frame, which avoids assigning into a selection of
        # `data` (the source of pandas' SettingWithCopyWarning).
        X = data[feature_cols].astype({'SURFACE': float})

        # Only the training part is used; the 30% hold-out part is not used
        # anywhere in this class, so it is discarded without being
        # transformed.
        X_train, _, y_train, _ = train_test_split(
            X, y, train_size=0.7, random_state=0)

        # Fill missing values with the most frequent value of each column.
        # TODO: suboptimal method
        train_mode = dict(X_train.mode().iloc[0])
        X_train = X_train.fillna(train_mode)

        is_categorical = X_train.dtypes == 'object'
        cat_cols = list(is_categorical[is_categorical].index)
        num_cols = list(is_categorical[~is_categorical].index)

        # Handle categorical values that are still missing.
        imputer = SimpleImputer(strategy='constant')
        X_train[cat_cols] = imputer.fit_transform(X_train[cat_cols])

        # One encoder per column: a single shared encoder would be refitted
        # on every column and only remember the last one.
        self.encoders = {}
        for col in cat_cols:
            encoder = LabelEncoder()
            X_train[col] = encoder.fit_transform(X_train[col])
            self.encoders[col] = encoder

        # Handle numerical values that are still missing.
        num_imputer = SimpleImputer(strategy='mean')
        X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])

        return X_train, y_train

    def get_trained_model(self):
        """Return the linear model fitted on ``self.X`` / ``self.y``.

        The model is fitted on the first call and cached, so later calls
        (and therefore later predictions) do not refit it.
        """
        # getattr keeps this working for instances whose __init__ was
        # bypassed or overridden without setting _model.
        if getattr(self, '_model', None) is None:
            self._model = LinearRegression().fit(self.X, self.y)
        return self._model

    def predict_expenses(self, data):
        """Return condominium expenses as a function of data.

        ``data`` is passed to the model's ``predict`` unchanged: no
        imputation or encoding is applied here.
        NOTE(review): it presumably has to use the same columns and encoding
        as ``self.X`` -- confirm against callers.
        """
        return self.get_trained_model().predict(data)