From 35816b96c2d14c965888e8f38ebf16f5311578f9 Mon Sep 17 00:00:00 2001 From: Ruidy Nemausat Date: Wed, 29 Jul 2020 23:41:16 +0200 Subject: [PATCH] 2h --- .gitignore | 4 + Pipfile | 17 + Pipfile.lock | 190 +++++ README.md | 25 + api/__init__.py | 0 api/admin.py | 3 + api/apps.py | 5 + api/data/model.ipynb | 1568 ++++++++++++++++++++++++++++++++++++++++ api/data/prediction.py | 83 +++ api/data/statistics.py | 65 ++ api/models.py | 18 + api/serializers.py | 15 + api/tests.py | 3 + api/urls.py | 8 + api/views.py | 29 + copro/__init__.py | 0 copro/asgi.py | 16 + copro/settings.py | 122 ++++ copro/urls.py | 22 + copro/wsgi.py | 16 + manage.py | 21 + 21 files changed, 2230 insertions(+) create mode 100644 .gitignore create mode 100644 Pipfile create mode 100644 Pipfile.lock create mode 100644 README.md create mode 100644 api/__init__.py create mode 100644 api/admin.py create mode 100644 api/apps.py create mode 100644 api/data/model.ipynb create mode 100644 api/data/prediction.py create mode 100644 api/data/statistics.py create mode 100644 api/models.py create mode 100644 api/serializers.py create mode 100644 api/tests.py create mode 100644 api/urls.py create mode 100644 api/views.py create mode 100644 copro/__init__.py create mode 100644 copro/asgi.py create mode 100644 copro/settings.py create mode 100644 copro/urls.py create mode 100644 copro/wsgi.py create mode 100755 manage.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c9117ce --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +**/api/migrations +**/api/data/.ipynb_checkpoints +**/api/data/*csv +db* diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..e9fda6b --- /dev/null +++ b/Pipfile @@ -0,0 +1,17 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +pandas = "*" +djangorestframework = "*" +django = "*" +sklearn = "*" +numpy = "*" +joblib = "*" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock 
b/Pipfile.lock new file mode 100644 index 0000000..cc7326f --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,190 @@ +{ + "_meta": { + "hash": { + "sha256": "b61f243cbc1d68a628203e9bf1210e97d98d5e74516535f2cb22fccf2471043b" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "asgiref": { + "hashes": [ + "sha256:7e51911ee147dd685c3c8b805c0ad0cb58d360987b56953878f8c06d2d1c6f1a", + "sha256:9fc6fb5d39b8af147ba40765234fa822b39818b12cc80b35ad9b0cef3a476aed" + ], + "version": "==3.2.10" + }, + "django": { + "hashes": [ + "sha256:31a5fbbea5fc71c99e288ec0b2f00302a0a92c44b13ede80b73a6a4d6d205582", + "sha256:5457fc953ec560c5521b41fad9e6734a4668b7ba205832191bbdff40ec61073c" + ], + "index": "pypi", + "version": "==3.0.8" + }, + "djangorestframework": { + "hashes": [ + "sha256:05809fc66e1c997fd9a32ea5730d9f4ba28b109b9da71fccfa5ff241201fd0a4", + "sha256:e782087823c47a26826ee5b6fa0c542968219263fb3976ec3c31edab23a4001f" + ], + "index": "pypi", + "version": "==3.11.0" + }, + "joblib": { + "hashes": [ + "sha256:8f52bf24c64b608bf0b2563e0e47d6fcf516abc8cfafe10cfd98ad66d94f92d6", + "sha256:d348c5d4ae31496b2aa060d6d9b787864dd204f9480baaa52d18850cb43e9f49" + ], + "index": "pypi", + "version": "==0.16.0" + }, + "numpy": { + "hashes": [ + "sha256:082f8d4dd69b6b688f64f509b91d482362124986d98dc7dc5f5e9f9b9c3bb983", + "sha256:1bc0145999e8cb8aed9d4e65dd8b139adf1919e521177f198529687dbf613065", + "sha256:309cbcfaa103fc9a33ec16d2d62569d541b79f828c382556ff072442226d1968", + "sha256:3673c8b2b29077f1b7b3a848794f8e11f401ba0b71c49fbd26fb40b71788b132", + "sha256:480fdd4dbda4dd6b638d3863da3be82873bba6d32d1fc12ea1b8486ac7b8d129", + "sha256:56ef7f56470c24bb67fb43dae442e946a6ce172f97c69f8d067ff8550cf782ff", + "sha256:5a936fd51049541d86ccdeef2833cc89a18e4d3808fe58a8abeb802665c5af93", + "sha256:5b6885c12784a27e957294b60f97e8b5b4174c7504665333c5e94fbf41ae5d6a", + 
"sha256:667c07063940e934287993366ad5f56766bc009017b4a0fe91dbd07960d0aba7", + "sha256:7ed448ff4eaffeb01094959b19cbaf998ecdee9ef9932381420d514e446601cd", + "sha256:8343bf67c72e09cfabfab55ad4a43ce3f6bf6e6ced7acf70f45ded9ebb425055", + "sha256:92feb989b47f83ebef246adabc7ff3b9a59ac30601c3f6819f8913458610bdcc", + "sha256:935c27ae2760c21cd7354402546f6be21d3d0c806fffe967f745d5f2de5005a7", + "sha256:aaf42a04b472d12515debc621c31cf16c215e332242e7a9f56403d814c744624", + "sha256:b12e639378c741add21fbffd16ba5ad25c0a1a17cf2b6fe4288feeb65144f35b", + "sha256:b1cca51512299841bf69add3b75361779962f9cee7d9ee3bb446d5982e925b69", + "sha256:b8456987b637232602ceb4d663cb34106f7eb780e247d51a260b84760fd8f491", + "sha256:b9792b0ac0130b277536ab8944e7b754c69560dac0415dd4b2dbd16b902c8954", + "sha256:c9591886fc9cbe5532d5df85cb8e0cc3b44ba8ce4367bd4cf1b93dc19713da72", + "sha256:cf1347450c0b7644ea142712619533553f02ef23f92f781312f6a3553d031fc7", + "sha256:de8b4a9b56255797cbddb93281ed92acbc510fb7b15df3f01bd28f46ebc4edae", + "sha256:e1b1dc0372f530f26a03578ac75d5e51b3868b9b76cd2facba4c9ee0eb252ab1", + "sha256:e45f8e981a0ab47103181773cc0a54e650b2aef8c7b6cd07405d0fa8d869444a", + "sha256:e4f6d3c53911a9d103d8ec9518190e52a8b945bab021745af4939cfc7c0d4a9e", + "sha256:ed8a311493cf5480a2ebc597d1e177231984c818a86875126cfd004241a73c3e", + "sha256:ef71a1d4fd4858596ae80ad1ec76404ad29701f8ca7cdcebc50300178db14dfc" + ], + "index": "pypi", + "version": "==1.19.1" + }, + "pandas": { + "hashes": [ + "sha256:0210f8fe19c2667a3817adb6de2c4fd92b1b78e1975ca60c0efa908e0985cbdb", + "sha256:0227e3a6e3a22c0e283a5041f1e3064d78fbde811217668bb966ed05386d8a7e", + "sha256:0bc440493cf9dc5b36d5d46bbd5508f6547ba68b02a28234cd8e81fdce42744d", + "sha256:16504f915f1ae424052f1e9b7cd2d01786f098fbb00fa4e0f69d42b22952d798", + "sha256:182a5aeae319df391c3df4740bb17d5300dcd78034b17732c12e62e6dd79e4a4", + "sha256:35db623487f00d9392d8af44a24516d6cb9f274afaf73cfcfe180b9c54e007d2", + 
"sha256:40ec0a7f611a3d00d3c666c4cceb9aa3f5bf9fbd81392948a93663064f527203", + "sha256:47a03bfef80d6812c91ed6fae43f04f2fa80a4e1b82b35aa4d9002e39529e0b8", + "sha256:4b21d46728f8a6be537716035b445e7ef3a75dbd30bd31aa1b251323219d853e", + "sha256:4d1a806252001c5db7caecbe1a26e49a6c23421d85a700960f6ba093112f54a1", + "sha256:60e20a4ab4d4fec253557d0fc9a4e4095c37b664f78c72af24860c8adcd07088", + "sha256:9f61cca5262840ff46ef857d4f5f65679b82188709d0e5e086a9123791f721c8", + "sha256:a15835c8409d5edc50b4af93be3377b5dd3eb53517e7f785060df1f06f6da0e2", + "sha256:b39508562ad0bb3f384b0db24da7d68a2608b9ddc85b1d931ccaaa92d5e45273", + "sha256:ed60848caadeacecefd0b1de81b91beff23960032cded0ac1449242b506a3b3f", + "sha256:fc714895b6de6803ac9f661abb316853d0cd657f5d23985222255ad76ccedc25" + ], + "index": "pypi", + "version": "==1.1.0" + }, + "python-dateutil": { + "hashes": [ + "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", + "sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a" + ], + "version": "==2.8.1" + }, + "pytz": { + "hashes": [ + "sha256:a494d53b6d39c3c6e44c3bec237336e14305e4f29bbf800b599253057fbb79ed", + "sha256:c35965d010ce31b23eeb663ed3cc8c906275d6be1a34393a1d73a41febf4a048" + ], + "version": "==2020.1" + }, + "scikit-learn": { + "hashes": [ + "sha256:04799686060ecbf8992f26a35be1d99e981894c8c7860c1365cda4200f954a16", + "sha256:058d213092de4384710137af1300ed0ff030b8c40459a6c6f73c31ccd274cc39", + "sha256:0c3464e46ef8bd4f1bfa5c009648c6449412c8f7e9b3fc0c9e3d800139c48827", + "sha256:0e7b55f73b35537ecd0d19df29dd39aa9e076dba78f3507b8136c819d84611fd", + "sha256:16feae4361be6b299d4d08df5a30956b4bfc8eadf173fe9258f6d59630f851d4", + "sha256:244ca85d6eba17a1e6e8a66ab2f584be6a7784b5f59297e3d7ff8c7983af627c", + "sha256:3e6e92b495eee193a8fa12a230c9b7976ea0fc1263719338e35c986ea1e42cff", + "sha256:5bcea4d6ee431c814261117281363208408aa4e665633655895feb059021aca6", + "sha256:93f56abd316d131645559ec0ab4f45e3391c2ccdd4eadaa4912f4c1e0a6f2c96", + 
"sha256:9e04c0811ea92931ee8490d638171b8cb2f21387efcfff526bbc8c2a3da60f1c", + "sha256:bded94236e16774385202cafd26190ce96db18e4dc21e99473848c61e4fdc400", + "sha256:c2fa33d20408b513cf432505c80e6eb4bf4d71434f1ae36680765d4a2c2a16ec", + "sha256:e3fec1c8831f8f93ad85581ca29ca1bb88e2da377fb097cf8322aa89c21bc9b8", + "sha256:e585682e37f2faa81ad6cd4472fff646bf2fd0542147bec93697a905db8e6bd2", + "sha256:e9879ba9e64ec3add41bf201e06034162f853652ef4849b361d73b0deb3153ad", + "sha256:ebe853e6f318f9d8b3b74dd17e553720d35646eff675a69eeaed12fbbbb07daa" + ], + "version": "==0.23.1" + }, + "scipy": { + "hashes": [ + "sha256:066c513d90eb3fd7567a9e150828d39111ebd88d3e924cdfc9f8ce19ab6f90c9", + "sha256:07e52b316b40a4f001667d1ad4eb5f2318738de34597bd91537851365b6c61f1", + "sha256:0a0e9a4e58a4734c2eba917f834b25b7e3b6dc333901ce7784fd31aefbd37b2f", + "sha256:1c7564a4810c1cd77fcdee7fa726d7d39d4e2695ad252d7c86c3ea9d85b7fb8f", + "sha256:315aa2165aca31375f4e26c230188db192ed901761390be908c9b21d8b07df62", + "sha256:6e86c873fe1335d88b7a4bfa09d021f27a9e753758fd75f3f92d714aa4093768", + "sha256:8e28e74b97fc8d6aa0454989db3b5d36fc27e69cef39a7ee5eaf8174ca1123cb", + "sha256:92eb04041d371fea828858e4fff182453c25ae3eaa8782d9b6c32b25857d23bc", + "sha256:a0afbb967fd2c98efad5f4c24439a640d39463282040a88e8e928db647d8ac3d", + "sha256:a785409c0fa51764766840185a34f96a0a93527a0ff0230484d33a8ed085c8f8", + "sha256:cca9fce15109a36a0a9f9cfc64f870f1c140cb235ddf27fe0328e6afb44dfed0", + "sha256:d56b10d8ed72ec1be76bf10508446df60954f08a41c2d40778bc29a3a9ad9bce", + "sha256:dac09281a0eacd59974e24525a3bc90fa39b4e95177e638a31b14db60d3fa806", + "sha256:ec5fe57e46828d034775b00cd625c4a7b5c7d2e354c3b258d820c6c72212a6ec", + "sha256:eecf40fa87eeda53e8e11d265ff2254729d04000cd40bae648e76ff268885d66", + "sha256:fc98f3eac993b9bfdd392e675dfe19850cc8c7246a8fd2b42443e506344be7d9" + ], + "version": "==1.5.2" + }, + "six": { + "hashes": [ + "sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259", + 
"sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced" + ], + "version": "==1.15.0" + }, + "sklearn": { + "hashes": [ + "sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31" + ], + "index": "pypi", + "version": "==0.0" + }, + "sqlparse": { + "hashes": [ + "sha256:022fb9c87b524d1f7862b3037e541f68597a730a8843245c349fc93e1643dc4e", + "sha256:e162203737712307dfe78860cc56c8da8a852ab2ee33750e33aeadf38d12c548" + ], + "version": "==0.3.1" + }, + "threadpoolctl": { + "hashes": [ + "sha256:38b74ca20ff3bb42caca8b00055111d74159ee95c4370882bbff2b93d24da725", + "sha256:ddc57c96a38beb63db45d6c159b5ab07b6bced12c45a1f07b2b92f272aebfa6b" + ], + "version": "==2.1.0" + } + }, + "develop": {} +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..6d41eb5 --- /dev/null +++ b/README.md @@ -0,0 +1,25 @@ +# Exercice dev python + +MeilleureCopro souhaite développer une API de statistiques sur des données d’annonces immobilières pour des besoins internes. + +Afin de valider cette demande, votre objectif est de construire en un minimum de temps une API qui répondra aux besoins suivants des utilisateurs internes MeilleureCopro : + +- L'utilisateur doit pouvoir connaître les charges de copropriétés moyennes, les quantiles 10%, 90% sur un département, une ville ou code postal +- L'utilisateur interne doit pouvoir très simplement interroger la base de données via un navigateur +- L'utilisateur doit pouvoir rentrer les informations suivantes sur un appartement au sein d'une copropriété : + + - une adresse, + - la surface de l'appartement, + - la présence d'ascenseur, + - la présence de chauffage collectif + + et obtenir une estimation des charges de l'appartement. Vous utiliserez un modèle pour faire cette estimation. + +Voici le dataset d’annonces immobilières sur lequel vous calculerez les statistiques demandées : [dataset](https://storage.googleapis.com/data.meilleurecopro.com/stage/dataset_annonces.csv.tar.gz). 
+ +## Outils choisis + +- pandas +- scikit-learn +- Django +- Django REST Framework diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/admin.py b/api/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/api/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/api/apps.py b/api/apps.py new file mode 100644 index 0000000..d87006d --- /dev/null +++ b/api/apps.py @@ -0,0 +1,5 @@ +from django.apps import AppConfig + + +class ApiConfig(AppConfig): + name = 'api' diff --git a/api/data/model.ipynb b/api/data/model.ipynb new file mode 100644 index 0000000..f36724d --- /dev/null +++ b/api/data/model.ipynb @@ -0,0 +1,1568 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Modelling" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Exploration" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (13) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
IDLATITUDELONGITUDEBLUR_RADIUSPRICECONDOMINIUM_EXPENSESWATER_HEATING_MODEFLOORFLOOR_COUNTLOT_COUNT...TERRACETERRACE_SURFACESWIMMING_POOLGARDENSTANDINGSMALL_BUILDINGCORNER_BUILDINGPUBLICATION_START_DATEDEALER_NAMEDEALER_TYPE
count0.02.346152e+061.222870e+061.221350e+062.346125e+069.259670e+050.01.729398e+061.076405e+061.014007e+06...0.00.00.00.00.00.00.00.00.00.0
meanNaN4.573629e+012.913308e+004.661755e+022.338131e+051.211348e+04NaN2.163156e+009.598156e+009.701759e+05...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
stdNaN6.898158e+005.784470e+003.041044e+022.942130e+054.006808e+06NaN7.338205e+005.448560e+037.661597e+08...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
minNaN-2.138532e+01-6.177292e+015.000000e+011.000000e+001.000000e-02NaN-9.000000e+000.000000e+000.000000e+00...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
25%NaN4.362788e+011.408058e+002.500000e+021.120000e+057.200000e+02NaN1.000000e+002.000000e+001.500000e+01...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
50%NaN4.649788e+012.441038e+005.000000e+021.690000e+051.200000e+03NaN2.000000e+004.000000e+004.300000e+01...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
75%NaN4.883240e+015.383308e+007.500000e+022.590000e+052.000000e+03NaN3.000000e+005.000000e+001.030000e+02...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
maxNaN5.108570e+015.572088e+011.000000e+037.500000e+071.920350e+09NaN2.019000e+035.652800e+067.610212e+11...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

8 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " ID LATITUDE LONGITUDE BLUR_RADIUS PRICE \\\n", + "count 0.0 2.346152e+06 1.222870e+06 1.221350e+06 2.346125e+06 \n", + "mean NaN 4.573629e+01 2.913308e+00 4.661755e+02 2.338131e+05 \n", + "std NaN 6.898158e+00 5.784470e+00 3.041044e+02 2.942130e+05 \n", + "min NaN -2.138532e+01 -6.177292e+01 5.000000e+01 1.000000e+00 \n", + "25% NaN 4.362788e+01 1.408058e+00 2.500000e+02 1.120000e+05 \n", + "50% NaN 4.649788e+01 2.441038e+00 5.000000e+02 1.690000e+05 \n", + "75% NaN 4.883240e+01 5.383308e+00 7.500000e+02 2.590000e+05 \n", + "max NaN 5.108570e+01 5.572088e+01 1.000000e+03 7.500000e+07 \n", + "\n", + " CONDOMINIUM_EXPENSES WATER_HEATING_MODE FLOOR FLOOR_COUNT \\\n", + "count 9.259670e+05 0.0 1.729398e+06 1.076405e+06 \n", + "mean 1.211348e+04 NaN 2.163156e+00 9.598156e+00 \n", + "std 4.006808e+06 NaN 7.338205e+00 5.448560e+03 \n", + "min 1.000000e-02 NaN -9.000000e+00 0.000000e+00 \n", + "25% 7.200000e+02 NaN 1.000000e+00 2.000000e+00 \n", + "50% 1.200000e+03 NaN 2.000000e+00 4.000000e+00 \n", + "75% 2.000000e+03 NaN 3.000000e+00 5.000000e+00 \n", + "max 1.920350e+09 NaN 2.019000e+03 5.652800e+06 \n", + "\n", + " LOT_COUNT ... TERRACE TERRACE_SURFACE SWIMMING_POOL GARDEN \\\n", + "count 1.014007e+06 ... 0.0 0.0 0.0 0.0 \n", + "mean 9.701759e+05 ... NaN NaN NaN NaN \n", + "std 7.661597e+08 ... NaN NaN NaN NaN \n", + "min 0.000000e+00 ... NaN NaN NaN NaN \n", + "25% 1.500000e+01 ... NaN NaN NaN NaN \n", + "50% 4.300000e+01 ... NaN NaN NaN NaN \n", + "75% 1.030000e+02 ... NaN NaN NaN NaN \n", + "max 7.610212e+11 ... 
NaN NaN NaN NaN \n", + "\n", + " STANDING SMALL_BUILDING CORNER_BUILDING PUBLICATION_START_DATE \\\n", + "count 0.0 0.0 0.0 0.0 \n", + "mean NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN \n", + "\n", + " DEALER_NAME DEALER_TYPE \n", + "count 0.0 0.0 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + "[8 rows x 22 columns]" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_path = 'annonces.csv'\n", + "raw_data = pd.read_csv(data_path,dtype={'DEPT_CODE': str, 'ZIP_CODE': str, 'INSEE_CODE': str})\n", + "raw_data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Le dataset contient 2346152 entrées et 39 colonnes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The aim is to predict CONDOMINIUM_EXPENSES so this is the target." + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDAD_URLSPROPERTY_TYPEDEPT_CODEZIP_CODECITYINSEE_CODELATITUDELONGITUDEBLUR_RADIUS...GARDENSTANDINGNEW_BUILDSMALL_BUILDINGCORNER_BUILDINGPUBLICATION_START_DATEDEALER_NAMEDEALER_TYPEREFERENCE_NUMBERENERGY_CLASSIFICATION
8NaNhttps://www.bienici.com/annonce/adapt-immo-850...APARTMENT0101750Saint-Laurent-sur-Saône0137046.3100724.8418131000.0...NaNNaNFalseNaNNaNNaNNaNNaN8500236963NS
9NaNhttps://www.bienici.com/annonce/orpi-1-099934E...APARTMENT0101750Saint-Laurent-sur-Saône0137046.3044414.84114150.0...NaNNaNFalseNaNNaNNaNNaNNaN099934E0KVCCNaN
15NaNhttps://www.bienici.com/annonce/fnaim-39964032APARTMENT0101000Bourg-en-Bresse0105346.205119NaNNaN...NaNNaNFalseNaNNaNNaNNaNNaN25871NaN
17NaNhttps://www.bienici.com/annonce/ag752451-11061...APARTMENT0101300Belley0103445.7421455.6868731000.0...NaNNaNFalseNaNNaNNaNNaNNaN421601048A
18NaNhttps://www.bienici.com/annonce/ag752451-11278...APARTMENT0101140Thoissey0142046.1695114.7842521000.0...NaNNaNFalseNaNNaNNaNNaNNaN421601005C
..................................................................
2346135NaNhttps://www.bienici.com/annonce/fnaim-37667773APARTMENT7878100Saint-Germain-en-Laye7855148.899041NaNNaN...NaNNaNFalseNaNNaNNaNNaNNaN7000D
2346136NaNhttps://www.bienici.com/annonce/fnaim-38342524APARTMENT7878100Saint-Germain-en-Laye7855148.899041NaNNaN...NaNNaNFalseNaNNaNNaNNaNNaN7000D
2346138NaNhttps://www.bienici.com/annonce/pericles-15861344APARTMENT7878100Saint-Germain-en-Laye7855148.8979472.10034050.0...NaNNaNFalseNaNNaNNaNNaNNaN15861344E
2346140NaNhttps://www.bienici.com/annonce/immo-facile-22...APARTMENT7878100Saint-Germain-en-Laye7855148.8950682.096111250.0...NaNNaNFalseNaNNaNNaNNaNNaN3984D
2346142NaNhttps://www.bienici.com/annonce/snpi-1106555APARTMENT7878000Versailles7864648.803540NaNNaN...NaNNaNFalseNaNNaNNaNNaNNaNAZ1-2014NaN
\n", + "

919093 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " ID AD_URLS PROPERTY_TYPE \\\n", + "8 NaN https://www.bienici.com/annonce/adapt-immo-850... APARTMENT \n", + "9 NaN https://www.bienici.com/annonce/orpi-1-099934E... APARTMENT \n", + "15 NaN https://www.bienici.com/annonce/fnaim-39964032 APARTMENT \n", + "17 NaN https://www.bienici.com/annonce/ag752451-11061... APARTMENT \n", + "18 NaN https://www.bienici.com/annonce/ag752451-11278... APARTMENT \n", + "... .. ... ... \n", + "2346135 NaN https://www.bienici.com/annonce/fnaim-37667773 APARTMENT \n", + "2346136 NaN https://www.bienici.com/annonce/fnaim-38342524 APARTMENT \n", + "2346138 NaN https://www.bienici.com/annonce/pericles-15861344 APARTMENT \n", + "2346140 NaN https://www.bienici.com/annonce/immo-facile-22... APARTMENT \n", + "2346142 NaN https://www.bienici.com/annonce/snpi-1106555 APARTMENT \n", + "\n", + " DEPT_CODE ZIP_CODE CITY INSEE_CODE LATITUDE \\\n", + "8 01 01750 Saint-Laurent-sur-Saône 01370 46.310072 \n", + "9 01 01750 Saint-Laurent-sur-Saône 01370 46.304441 \n", + "15 01 01000 Bourg-en-Bresse 01053 46.205119 \n", + "17 01 01300 Belley 01034 45.742145 \n", + "18 01 01140 Thoissey 01420 46.169511 \n", + "... ... ... ... ... ... \n", + "2346135 78 78100 Saint-Germain-en-Laye 78551 48.899041 \n", + "2346136 78 78100 Saint-Germain-en-Laye 78551 48.899041 \n", + "2346138 78 78100 Saint-Germain-en-Laye 78551 48.897947 \n", + "2346140 78 78100 Saint-Germain-en-Laye 78551 48.895068 \n", + "2346142 78 78000 Versailles 78646 48.803540 \n", + "\n", + " LONGITUDE BLUR_RADIUS ... GARDEN STANDING NEW_BUILD \\\n", + "8 4.841813 1000.0 ... NaN NaN False \n", + "9 4.841141 50.0 ... NaN NaN False \n", + "15 NaN NaN ... NaN NaN False \n", + "17 5.686873 1000.0 ... NaN NaN False \n", + "18 4.784252 1000.0 ... NaN NaN False \n", + "... ... ... ... ... ... ... \n", + "2346135 NaN NaN ... NaN NaN False \n", + "2346136 NaN NaN ... NaN NaN False \n", + "2346138 2.100340 50.0 ... NaN NaN False \n", + "2346140 2.096111 250.0 ... 
NaN NaN False \n", + "2346142 NaN NaN ... NaN NaN False \n", + "\n", + " SMALL_BUILDING CORNER_BUILDING PUBLICATION_START_DATE DEALER_NAME \\\n", + "8 NaN NaN NaN NaN \n", + "9 NaN NaN NaN NaN \n", + "15 NaN NaN NaN NaN \n", + "17 NaN NaN NaN NaN \n", + "18 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "2346135 NaN NaN NaN NaN \n", + "2346136 NaN NaN NaN NaN \n", + "2346138 NaN NaN NaN NaN \n", + "2346140 NaN NaN NaN NaN \n", + "2346142 NaN NaN NaN NaN \n", + "\n", + " DEALER_TYPE REFERENCE_NUMBER ENERGY_CLASSIFICATION \n", + "8 NaN 8500236963 NS \n", + "9 NaN 099934E0KVCC NaN \n", + "15 NaN 25871 NaN \n", + "17 NaN 421601048 A \n", + "18 NaN 421601005 C \n", + "... ... ... ... \n", + "2346135 NaN 7000 D \n", + "2346136 NaN 7000 D \n", + "2346138 NaN 15861344 E \n", + "2346140 NaN 3984 D \n", + "2346142 NaN AZ1-2014 NaN \n", + "\n", + "[919093 rows x 39 columns]" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# drop rows without CONDOMINIUM_EXPENSES, DEPT_CODE, ZIP_CODE, CITY\n", + "feature_cols = ['CONDOMINIUM_EXPENSES', 'DEPT_CODE', 'ZIP_CODE', 'CITY','SURFACE']\n", + "data = raw_data.dropna(subset=feature_cols)\n", + "data.describe()\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "925967 labelled entries" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing values per feature:\n", + "ZIP_CODE:\t0\n", + "DEPT_CODE:\t0\n", + "CITY:\t0\n", + "INSEE_CODE:\t0\n", + "LATITUDE:\t0\n", + "LONGITUDE:\t335521\n", + "SURFACE:\t0\n", + "HEATING_MODE:\t550030\n", + "ELEVATOR:\t545593\n" + ] + } + ], + "source": [ + "NaNs = [f\"{col}:\\t{data[col].isnull().sum()}\" for col in data.columns]\n", + "print(\"Missing values per feature:\")\n", + "print(\"\\n\".join(NaNs))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training 
step: baseline" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "# Given missing values we will consider only these columns\n", + "#cols = ['PROPERTY_TYPE', 'DEPT_CODE', 'ZIP_CODE', 'CITY', 'INSEE_CODE', 'LATITUDE', 'LONGITUDE', \n", + "# 'MARKETING_TYPE', 'PRICE', 'SURFACE', 'HEATING_MODE', 'ELEVATOR', 'FLOOR', 'PARKING', \n", + "# 'PARKING_COUNT', 'NEW_BUILD', 'ENERGY_CLASSIFICATION'] \n", + "cols = ['ZIP_CODE','DEPT_CODE', 'CITY', 'INSEE_CODE', 'LATITUDE', 'LONGITUDE', 'SURFACE', 'HEATING_MODE', 'ELEVATOR'] " + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " This is separate from the ipykernel package so we can avoid doing imports until\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ZIP_CODEDEPT_CODECITYINSEE_CODELATITUDELONGITUDESURFACEHEATING_MODEELEVATOR
80175001Saint-Laurent-sur-Saône0137046.3100724.84181345.0NaNNaN
90175001Saint-Laurent-sur-Saône0137046.3044414.84114129.5NaNNaN
150100001Bourg-en-Bresse0105346.205119NaN19.0NaNNaN
170130001Belley0103445.7421455.68687359.0NaNFalse
180114001Thoissey0142046.1695114.78425268.0COLLECTIVENaN
\n", + "
" + ], + "text/plain": [ + " ZIP_CODE DEPT_CODE CITY INSEE_CODE LATITUDE \\\n", + "8 01750 01 Saint-Laurent-sur-Saône 01370 46.310072 \n", + "9 01750 01 Saint-Laurent-sur-Saône 01370 46.304441 \n", + "15 01000 01 Bourg-en-Bresse 01053 46.205119 \n", + "17 01300 01 Belley 01034 45.742145 \n", + "18 01140 01 Thoissey 01420 46.169511 \n", + "\n", + " LONGITUDE SURFACE HEATING_MODE ELEVATOR \n", + "8 4.841813 45.0 NaN NaN \n", + "9 4.841141 29.5 NaN NaN \n", + "15 NaN 19.0 NaN NaN \n", + "17 5.686873 59.0 NaN False \n", + "18 4.784252 68.0 COLLECTIVE NaN " + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y = data['CONDOMINIUM_EXPENSES']\n", + "X = data[cols]\n", + "X['SURFACE'] = X['SURFACE'].astype(float)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nan 'COLLECTIVE' 'INDIVIDUAL']\n", + "[nan False True]\n", + "[ 45. 29.5 19. ... 
208.82 195.8 213.43]\n", + "['01' '02' '03' '04' '06' '07' '08' '09' '93' '11' '12' '67' '13' '14'\n", + " '15' '17' '16' '18' '19' '2A' '21' '22' '23' '79' '24' '25' '26' '91'\n", + " '28' '27' '29' '30' '32' '33' '97' '2B' '31' '43' '52' '05' '70' '74'\n", + " '65' '87' '68' '92' '34' '35' '37' '36' '38' '39' '40' '44' '42' '41'\n", + " '45' '47' '46' '48' '49' '50' '51' '53' '54' '55' '56' '57' '58' '59'\n", + " '60' '61' '75' '62' '63' '64' '66' '69' '71' '72' '73' '77' '76' '80'\n", + " '82' '81' '90' '94' '95' '83' '84' '85' '88' '89' '78']\n" + ] + } + ], + "source": [ + "print(X['HEATING_MODE'].unique())\n", + "print(X['ELEVATOR'].unique())\n", + "print(X['SURFACE'].unique())\n", + "print(X['DEPT_CODE'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": true + }, + "outputs": [ + { + "ename": "IndexError", + "evalue": "single positional indexer is out-of-bounds", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'SURFACE'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'SURFACE'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1766\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1767\u001b[0m \u001b[0mmaybe_callable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1768\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmaybe_callable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1769\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 2136\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[0;31m# validate the location\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2138\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2139\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2140\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_integer\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 2061\u001b[0m \u001b[0mlen_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2062\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mlen_axis\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mlen_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2063\u001b[0;31m 
\u001b[0;32mraise\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"single positional indexer is out-of-bounds\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2064\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2065\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_tuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtup\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: single positional indexer is out-of-bounds" + ] + } + ], + "source": [ + "from math import isnan\n", + "\n", + "for i, row in X.iterrows():\n", + " x = X['SURFACE'].iloc[i]\n", + " if not isinstance(x,str):\n", + " X.at[i,'SURFACE'] = 0\n", + " elif isinstance(x,str):\n", + " try:\n", + " X.at[i,'SURFACE'] = float(x)\n", + " except:\n", + " X.at[i,'SURFACE'] = 0\n", + " else:\n", + " X.at[i,'SURFACE'] = 0\n", + "\n", + "print(len(list(X.iterrows())))\n", + "X['SURFACE'].values\n", + "#X['DEPT_CODE'].apply(lambda x: str(x))\n", + "#x = X['DEPT_CODE'].iloc[1]\n", + "#isinstance(x,float)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8 94.0\n", + "9 37.0\n", + "15 35.0\n", + "17 34.0\n", + "18 29.0\n", + " ... 
\n", + "2346135 244.0\n", + "2346136 244.0\n", + "2346138 190.0\n", + "2346140 210.0\n", + "2346142 171.0\n", + "Name: SURFACE, Length: 919093, dtype: float64" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pos = 1 \n", + "if isinstance(X['DEPT_CODE'].iloc[pos], float):\n", + " X['DEPT_CODE'].iloc[pos] = str(int(X['DEPT_CODE'].iloc[pos]))\n", + "X['DEPT_CODE'].iloc[pos]\n", + "X['DEPT_CODE'].iloc[61540]" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ZIP_CODE': '21000', 'DEPT_CODE': '06', 'CITY': 'Toulouse', 'INSEE_CODE': '31555', 'LATITUDE': 43.6044621670154, 'LONGITUDE': 1.09373271, 'SURFACE': 65.0, 'HEATING_MODE': 'INDIVIDUAL', 'ELEVATOR': True}\n" + ] + } + ], + "source": [ + "train_mode = dict(X_train.mode().iloc[0])\n", + "X_train = X_train.fillna(train_mode)\n", + "print(train_mode)" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ZIP_CODE', 'DEPT_CODE', 'CITY', 'INSEE_CODE', 'HEATING_MODE']\n", + "['LATITUDE', 'LONGITUDE', 'SURFACE', 'ELEVATOR']\n" + ] + } + ], + "source": [ + "cols = X_train.dtypes == 'object'\n", + "cat_cols = list(cols[cols].index)\n", + "cols = 
X_train.dtypes != 'object'\n", + "num_cols = list(cols[cols].index)\n", + "print(cat_cols)\n", + "print(num_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 45. , 29.5, 19. , ..., 190. , 210. , 171. ])" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X['SURFACE'].values" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.impute import SimpleImputer" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " after removing the cwd from sys.path.\n", + "/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py:966: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self.obj[item] = s\n" + ] + } + ], + "source": [ + "# handle missing values on categorical columns\n", + "imputer = SimpleImputer(strategy='constant')\n", + "X_train[cat_cols] = imputer.fit_transform(X_train[cat_cols])\n", + "X_val[cat_cols] = imputer.transform(X_val[cat_cols])" + ] + }, + { + "cell_type": "code", + 
"execution_count": 168, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ZIP_CODE\n", + "DEPT_CODE\n", + "CITY\n", + "INSEE_CODE\n", + "HEATING_MODE\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ZIP_CODEDEPT_CODECITYINSEE_CODELATITUDELONGITUDESURFACEHEATING_MODEELEVATOR
1171461525174933243.5649787.07896964.00True
157490167581327143.5515201.09373380.01True
11394111188343856177848.1113391.09373376.00True
18406592621723267387645.4307981.09373335.51True
16994292180621123314245.7774551.09373359.01True
..............................
22652233166841060488846.5050001.09373398.01True
3981235361664673247.0822772.39856663.01True
31369435811361152043.3530615.47124061.00True
11579381173343856177848.1113391.093733103.01True
761889954304933146643.5827851.09373395.01True
\n", + "

643365 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " ZIP_CODE DEPT_CODE CITY INSEE_CODE LATITUDE LONGITUDE SURFACE \\\n", + "117146 152 5 1749 332 43.564978 7.078969 64.0 \n", + "157490 167 5 813 271 43.551520 1.093733 80.0 \n", + "1139411 1188 34 3856 1778 48.111339 1.093733 76.0 \n", + "1840659 2621 72 3267 3876 45.430798 1.093733 35.5 \n", + "1699429 2180 62 1123 3142 45.777455 1.093733 59.0 \n", + "... ... ... ... ... ... ... ... \n", + "2265223 3166 84 1060 4888 46.505000 1.093733 98.0 \n", + "398123 536 16 646 732 47.082277 2.398566 63.0 \n", + "313694 358 11 3611 520 43.353061 5.471240 61.0 \n", + "1157938 1173 34 3856 1778 48.111339 1.093733 103.0 \n", + "761889 954 30 4933 1466 43.582785 1.093733 95.0 \n", + "\n", + " HEATING_MODE ELEVATOR \n", + "117146 0 True \n", + "157490 1 True \n", + "1139411 0 True \n", + "1840659 1 True \n", + "1699429 1 True \n", + "... ... ... \n", + "2265223 1 True \n", + "398123 1 True \n", + "313694 0 True \n", + "1157938 1 True \n", + "761889 1 True \n", + "\n", + "[643365 rows x 9 columns]" + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.preprocessing import LabelEncoder\n", + "\n", + "#encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n", + "labEnc = LabelEncoder()\n", + "\n", + "for col in cat_cols:\n", + " X_train[col] = labEnc.fit_transform(X_train[col])\n", + "# X_val[col] = labEnc.transform(X_val[col])\n", + "\n", + "X_train\n", + "\n", + "#OH_X_train = pd.DataFrame(encoder.fit_transform(X_train[cat_cols]))\n", + "#OH_X_val = encoder.transform(X_val[cat_cols])\n", + "\n", + "#OH_X_train" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", + "A value 
is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " after removing the cwd from sys.path.\n", + "/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py:966: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self.obj[item] = s\n" + ] + } + ], + "source": [ + "num_imputer = SimpleImputer(strategy='mean')\n", + "\n", + "X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])\n", + "X_val[num_cols] = num_imputer.transform(X_val[num_cols])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#from sklearn.metrics import mean_absolute_error pour evaluer erreur\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [], + "source": [ + "lin_reg = LinearRegression().fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_absolute_error" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "could not convert string to float: 'carcassonne'", + "output_type": "error", + "traceback": [ + 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpreds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlin_reg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0mpredicted\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 235\u001b[0m \"\"\"\n\u001b[0;32m--> 236\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decision_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0m_preprocess_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstaticmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_preprocess_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36m_decision_function\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 218\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'coo'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 219\u001b[0m return safe_sparse_dot(X, self.coef_.T,\n\u001b[1;32m 220\u001b[0m dense_output=True) + self.intercept_\n", + "\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 71\u001b[0m FutureWarning)\n\u001b[1;32m 72\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, 
order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcasting\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"unsafe\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 600\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m raise ValueError(\"Complex data not supported\\n\"\n", + "\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/numpy/core/_asarray.py\u001b[0m in \u001b[0;36masarray\u001b[0;34m(a, dtype, order)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \"\"\"\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m 
class LinearRegressor:
    """Linear regression of condominium expenses, trained on the listings CSV.

    Building an instance reads ``data_path`` and runs the whole preprocessing
    pipeline, so construction is expensive; create one instance and reuse it.
    """

    def __init__(self):
        self._model = None
        self.X, self.y = self.preprocessing()

    def preprocessing(self):
        """Prepare data for modelling.

        As a side effect, records on the instance everything needed to encode
        a raw record at prediction time: ``feature_names``, ``fill_values``,
        ``cat_cols``, ``num_cols`` and ``encoders`` (one fitted
        ``LabelEncoder`` per categorical column).

        Returns:
            tuple: ``(X_train, y_train)`` -- the encoded training features and
            the matching ``CONDOMINIUM_EXPENSES`` target.
        """

        # read csv; code columns are kept as strings so leading zeros survive
        raw_data = pd.read_csv(
            data_path, dtype={'DEPT_CODE': str, 'ZIP_CODE': str, 'INSEE_CODE': str})

        # drop rows with missing values in the significant columns
        required_cols = ['CONDOMINIUM_EXPENSES',
                         'DEPT_CODE', 'ZIP_CODE', 'CITY', 'SURFACE']
        data = raw_data.dropna(subset=required_cols)

        # columns used in modelling
        feature_cols = ['ZIP_CODE', 'DEPT_CODE', 'CITY', 'INSEE_CODE', 'LATITUDE',
                        'LONGITUDE', 'SURFACE', 'HEATING_MODE', 'ELEVATOR']

        # target
        y = data['CONDOMINIUM_EXPENSES']
        # features: take an explicit copy so the column assignment below
        # writes to our own frame instead of a slice of `data`
        X = data[feature_cols].copy()

        # surface values are not reliably typed in the CSV; force floats
        X['SURFACE'] = X['SURFACE'].astype(float)

        # split dataset for training; only the training part is used, the
        # split is kept so the training subset stays the same as before
        X_train, _, y_train, _ = train_test_split(
            X, y, train_size=0.7, random_state=0)

        # fill missing values with most frequent values. TODO: suboptimal method
        fill_values = dict(X_train.mode().iloc[0])
        X_train = X_train.fillna(fill_values)

        is_categorical = X_train.dtypes == 'object'
        cat_cols = list(is_categorical[is_categorical].index)
        num_cols = list(is_categorical[~is_categorical].index)

        # handle missing values on categorical columns
        imputer = SimpleImputer(strategy='constant')
        X_train[cat_cols] = imputer.fit_transform(X_train[cat_cols])

        # encode categorical data; one encoder per column, kept so that the
        # same label -> integer mapping can be applied to prediction inputs
        encoders = {}
        for col in cat_cols:
            encoders[col] = LabelEncoder()
            X_train[col] = encoders[col].fit_transform(X_train[col])

        # handle numerical missing values
        num_imputer = SimpleImputer(strategy='mean')
        X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])

        self.feature_names = list(X_train.columns)
        self.fill_values = fill_values
        self.cat_cols = cat_cols
        self.num_cols = num_cols
        self.encoders = encoders

        return X_train, y_train

    def get_trained_model(self):
        """Return the fitted ``LinearRegression``, fitting it on first use.

        The fitted model is cached on the instance so repeated predictions do
        not refit on the full training set.
        """
        model = getattr(self, '_model', None)
        if model is None:
            model = LinearRegression().fit(self.X, self.y)
            self._model = model

        return model

    def _encode_record(self, record):
        """Turn one raw ``{column: value}`` record into a one-row feature frame.

        Missing columns are filled with the training modes and categorical
        values are mapped with the encoders fitted in ``preprocessing``.

        Raises:
            ValueError: from ``LabelEncoder.transform`` when a categorical
                value was never seen during training.
        """
        row = pd.DataFrame(
            [{col: record.get(col) for col in self.feature_names}],
            columns=self.feature_names)
        row = row.fillna(self.fill_values)

        for col in self.cat_cols:
            row[col] = self.encoders[col].transform(row[col])
        row[self.num_cols] = row[self.num_cols].astype(float)

        return row

    def predict_expenses(self, data):
        """Return condominium expenses as a function of data.

        Args:
            data: either a raw record given as a dict keyed by feature name
                (encoded here before prediction), or anything the fitted model
                accepts directly (already-encoded 2-D features).

        Returns:
            The array returned by ``LinearRegression.predict``.
        """

        model = self.get_trained_model()

        if isinstance(data, dict):
            data = self._encode_record(data)

        return model.predict(data)
def get_data_from_csv(path):
    """
    Create dataframe from csv filepath. Keep only usable rows.

    `path` is the csv file location.

    Rows missing any of `CONDOMINIUM_EXPENSES`, `DEPT_CODE`, `ZIP_CODE` or
    `CITY` are dropped; all columns are kept.
    """

    # Read the code columns as text: parsed as numbers, '01' would become 1
    # and could never match a query for '01'.
    raw_data = pd.read_csv(path, dtype={'DEPT_CODE': str, 'ZIP_CODE': str})

    # drop rows that cannot be used for the statistics
    required_cols = ['CONDOMINIUM_EXPENSES', 'DEPT_CODE', 'ZIP_CODE', 'CITY']

    return raw_data.dropna(subset=required_cols)


def get_condo_expenses_by(col, value):
    """
    Return mean, 1st and 9th decile condominium expenses values for the given query type.

    `col` can either be: `DEPT_CODE`, `CITY` or `ZIP_CODE`.

    `value` is the actual query parameter; it is compared as text.

    Raises `ValueError` when `col` is not one of the supported columns and
    `IndexError` when no row matches `value`.
    """

    allowed_cols = ('DEPT_CODE', 'CITY', 'ZIP_CODE')
    if col not in allowed_cols:
        # explicit raise: an assert would vanish under `python -O`
        raise ValueError(
            "col must be 'DEPT_CODE', 'CITY' or 'ZIP_CODE', got %r" % (col,))

    data = get_data_from_csv(data_path)

    # Select the matching rows first, then compute the three statistics on
    # that subset only instead of aggregating every group in the file.
    expenses = data.loc[data[col] == str(value), 'CONDOMINIUM_EXPENSES']
    if expenses.empty:
        raise IndexError("no data for %s=%r" % (col, value))

    return {
        "mean": float(expenses.mean()),
        "1st_quantile": float(expenses.quantile(0.1)),
        "9th_quantile": float(expenses.quantile(0.9)),
    }
# Shared regressor, built on first use: constructing it reads the CSV and
# preprocesses the data, which is far too slow to repeat for every request.
_regressor = None


def _get_regressor():
    """Return the module-wide LinearRegressor, creating it on first call."""
    global _regressor
    if _regressor is None:
        _regressor = LinearRegressor()
    return _regressor


@api_view(['GET', 'POST'])
def get_condominium_expenses(request):
    """Return expense statistics for a POSTed query; an empty object on GET.

    The POST body must carry `query_type` and `query_val`. A missing field or
    an unsupported query type answers 400, a query matching no data answers
    404.

    TODO: refactor using APIView class
    """

    data = {}
    if request.method == "POST":
        try:
            query_type = request.data["query_type"]
            query_val = request.data['query_val']
        except KeyError as missing:
            return Response(
                {"detail": "missing field: %s" % missing.args[0]}, status=400)

        try:
            data = get_condo_expenses_by(query_type, query_val)
        except (AssertionError, ValueError) as invalid:
            # unsupported query type
            return Response({"detail": str(invalid)}, status=400)
        except IndexError:
            # the lookup found no row for this value
            return Response(
                {"detail": "no data for this query"}, status=404)

    return Response(data)


@api_view(['POST'])
def make_expenses_prediction(request):
    """Return condominium expenses prediction."""

    # predict_expenses is an instance method: it must be called on a
    # LinearRegressor instance, not on the class itself.
    algo = _get_regressor()
    prediction = algo.predict_expenses(request.data)

    return Response(prediction)
+DEBUG = True + +ALLOWED_HOSTS = [] + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'rest_framework', + 'api.apps.ApiConfig' +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'copro.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'copro.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/3.0/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), + } +} + + +# Password validation +# https://docs.djangoproject.com/en/3.0/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/3.0/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 
'UTC' + +USE_I18N = True + +USE_L10N = True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/3.0/howto/static-files/ + +STATIC_URL = '/static/' diff --git a/copro/urls.py b/copro/urls.py new file mode 100644 index 0000000..db24c71 --- /dev/null +++ b/copro/urls.py @@ -0,0 +1,22 @@ +"""copro URL Configuration + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/3.0/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import include, path + +urlpatterns = [ + path('admin/', admin.site.urls), + path('', include('api.urls')) +] diff --git a/copro/wsgi.py b/copro/wsgi.py new file mode 100644 index 0000000..03f84d4 --- /dev/null +++ b/copro/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for copro project. + +It exposes the WSGI callable as a module-level variable named ``application``. 
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Select the project settings module and dispatch the CLI arguments.

    Sets ``DJANGO_SETTINGS_MODULE`` to ``copro.settings`` unless it is already
    present in the environment, then passes ``sys.argv`` to Django's
    ``execute_from_command_line``.

    Raises:
        ImportError: if Django cannot be imported; the original import error
            is attached as the cause.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'copro.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as import_failure:
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from import_failure
    else:
        # Only reached when the import succeeded; errors raised by the
        # command itself propagate untouched.
        execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()