mirror of
https://github.com/rjNemo/copro-api
synced 2026-06-06 02:16:44 +00:00
83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.impute import SimpleImputer
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.model_selection import train_test_split
|
|
# TODO: use one hot encoding to handle unknown values in transform step
|
|
from sklearn.preprocessing import LabelEncoder
|
|
|
|
from .statistics import data_path
|
|
|
|
|
|
class LinearRegressor:
    """Linear regression of condominium expenses on housing features.

    On construction the CSV located at ``data_path`` is loaded and turned
    into a fully numeric training set, stored as ``self.X`` (features) and
    ``self.y`` (target).
    """

    def __init__(self):
        self.X, self.y = self.preprocessing()
        # Fitted lazily by predict_expenses so the regression is not
        # re-trained for every single prediction.
        self._model = None

    def preprocessing(self):
        """Prepare data for modelling.

        Returns:
            A ``(X_train, y_train)`` tuple: the 70 % training part of the
            split, with missing values filled and every ``object`` column
            label-encoded, together with the matching
            ``CONDOMINIUM_EXPENSES`` values.
        """
        # The code columns are read as strings instead of numbers.
        raw_data = pd.read_csv(
            data_path,
            dtype={'DEPT_CODE': str, 'ZIP_CODE': str, 'INSEE_CODE': str})

        # Drop rows with a missing value in any of the significant columns.
        required_cols = ['CONDOMINIUM_EXPENSES',
                         'DEPT_CODE', 'ZIP_CODE', 'CITY', 'SURFACE']
        data = raw_data.dropna(subset=required_cols)

        # Columns used in modelling.
        feature_cols = ['ZIP_CODE', 'DEPT_CODE', 'CITY', 'INSEE_CODE',
                        'LATITUDE', 'LONGITUDE', 'SURFACE', 'HEATING_MODE',
                        'ELEVATOR']

        # Target.
        y = data['CONDOMINIUM_EXPENSES']
        # Features. The explicit copy makes the column assignment below
        # operate on an independent frame instead of a selection derived
        # from ``data`` (which triggers pandas' SettingWithCopyWarning).
        X = data[feature_cols].copy()

        # SURFACE is not reliably typed in the CSV; force it to float.
        X['SURFACE'] = X['SURFACE'].astype(float)

        # Split the dataset. Only the training part is returned, so the
        # validation part is discarded here instead of being imputed and
        # then thrown away.
        X_train, _, y_train, _ = train_test_split(
            X, y, train_size=0.7, random_state=0)

        # Fill missing values with the most frequent value of each column.
        # TODO: suboptimal method
        train_mode = dict(X_train.mode().iloc[0])
        X_train = X_train.fillna(train_mode)

        # Columns of dtype ``object`` are treated as categorical, all
        # others as numerical.
        is_categorical = X_train.dtypes == 'object'
        cat_cols = list(is_categorical[is_categorical].index)
        num_cols = list(is_categorical[~is_categorical].index)

        # Handle any remaining missing values on categorical columns.
        imputer = SimpleImputer(strategy='constant')
        X_train[cat_cols] = imputer.fit_transform(X_train[cat_cols])

        # Encode categorical data as integer labels, one encoder per
        # column (each column has its own set of classes).
        for col in cat_cols:
            X_train[col] = LabelEncoder().fit_transform(X_train[col])

        # Handle any remaining missing values on numerical columns.
        num_imputer = SimpleImputer(strategy='mean')
        X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])

        return X_train, y_train

    def get_trained_model(self):
        """Fit and return a new ``LinearRegression`` on ``self.X``/``self.y``."""
        return LinearRegression().fit(self.X, self.y)

    def predict_expenses(self, data):
        """Return condominium expenses as a function of data.

        ``data`` is handed to the model's ``predict`` unchanged, so it must
        already be in the encoded, numeric layout of ``self.X``.

        The regression is fitted on the first call and reused afterwards.
        """
        # getattr keeps instances created without __init__ working.
        model = getattr(self, '_model', None)
        if model is None:
            model = self._model = self.get_trained_model()

        return model.predict(data)
|