copro-api/api/data/statistics.py
2020-07-29 23:41:16 +02:00

65 lines
1.7 KiB
Python

import os
import pandas as pd
from copro.settings import BASE_DIR
# Location of the csv file the statistics in this module are computed from.
# TODO: read this path from an environment variable instead of hard-coding it.
data_path = os.path.join(BASE_DIR, 'api/data/annonces.csv')
def get_data_from_csv(path):
    """
    Load the csv located at `path` into a dataframe.

    Rows with a missing value in any of the `CONDOMINIUM_EXPENSES`,
    `DEPT_CODE`, `ZIP_CODE` or `CITY` columns are discarded. Every column
    of the remaining rows is kept.
    """
    # columns that must be filled in for a row to be usable
    required_columns = ['CONDOMINIUM_EXPENSES', 'DEPT_CODE', 'ZIP_CODE', 'CITY']
    return pd.read_csv(path).dropna(subset=required_columns)
def get_condo_expenses_by(col, value):
    """
    Return mean, 1st and 9th decile condominium expenses for one location.

    `col` is the column to query on: `DEPT_CODE`, `CITY` or `ZIP_CODE`.
    `value` is the actual query parameter; it is compared with `==` against
    the values of `col` as parsed from the csv file.

    Returns a dict with the keys `mean`, `1st_quantile` and `9th_quantile`
    (the last two hold the 0.1 and 0.9 quantiles).

    Raises `AssertionError` when `col` is not a supported column and
    `IndexError` when no row matches `value`.
    """
    allowed_cols = ('DEPT_CODE', 'CITY', 'ZIP_CODE')
    if col not in allowed_cols:
        # Raised explicitly instead of through an `assert` statement so the
        # check still runs under `python -O`. The exception type is kept so
        # existing callers are unaffected.
        raise AssertionError(
            "col must be 'DEPT_CODE', 'CITY' or 'ZIP_CODE', got {!r}".format(col)
        )

    data = get_data_from_csv(data_path)

    # Select the requested rows first: the statistics are then computed for
    # that single group only rather than for every group in the file.
    expenses = data.loc[data[col] == value, 'CONDOMINIUM_EXPENSES']
    if expenses.empty:
        # An empty selection is reported as IndexError, the type existing
        # callers already get for an unknown value, but with a message that
        # says what was looked up.
        raise IndexError(
            "no condominium expenses found for {} == {!r}".format(col, value)
        )

    return {
        "mean": expenses.mean(),
        "1st_quantile": expenses.quantile(0.1),
        "9th_quantile": expenses.quantile(0.9),
    }