copro-api/api/data/model.ipynb
2020-07-29 23:41:16 +02:00

1568 lines
64 KiB
Text
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Modelling"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Exploration"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (13) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>LATITUDE</th>\n",
" <th>LONGITUDE</th>\n",
" <th>BLUR_RADIUS</th>\n",
" <th>PRICE</th>\n",
" <th>CONDOMINIUM_EXPENSES</th>\n",
" <th>WATER_HEATING_MODE</th>\n",
" <th>FLOOR</th>\n",
" <th>FLOOR_COUNT</th>\n",
" <th>LOT_COUNT</th>\n",
" <th>...</th>\n",
" <th>TERRACE</th>\n",
" <th>TERRACE_SURFACE</th>\n",
" <th>SWIMMING_POOL</th>\n",
" <th>GARDEN</th>\n",
" <th>STANDING</th>\n",
" <th>SMALL_BUILDING</th>\n",
" <th>CORNER_BUILDING</th>\n",
" <th>PUBLICATION_START_DATE</th>\n",
" <th>DEALER_NAME</th>\n",
" <th>DEALER_TYPE</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>0.0</td>\n",
" <td>2.346152e+06</td>\n",
" <td>1.222870e+06</td>\n",
" <td>1.221350e+06</td>\n",
" <td>2.346125e+06</td>\n",
" <td>9.259670e+05</td>\n",
" <td>0.0</td>\n",
" <td>1.729398e+06</td>\n",
" <td>1.076405e+06</td>\n",
" <td>1.014007e+06</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>4.573629e+01</td>\n",
" <td>2.913308e+00</td>\n",
" <td>4.661755e+02</td>\n",
" <td>2.338131e+05</td>\n",
" <td>1.211348e+04</td>\n",
" <td>NaN</td>\n",
" <td>2.163156e+00</td>\n",
" <td>9.598156e+00</td>\n",
" <td>9.701759e+05</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>6.898158e+00</td>\n",
" <td>5.784470e+00</td>\n",
" <td>3.041044e+02</td>\n",
" <td>2.942130e+05</td>\n",
" <td>4.006808e+06</td>\n",
" <td>NaN</td>\n",
" <td>7.338205e+00</td>\n",
" <td>5.448560e+03</td>\n",
" <td>7.661597e+08</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>-2.138532e+01</td>\n",
" <td>-6.177292e+01</td>\n",
" <td>5.000000e+01</td>\n",
" <td>1.000000e+00</td>\n",
" <td>1.000000e-02</td>\n",
" <td>NaN</td>\n",
" <td>-9.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>4.362788e+01</td>\n",
" <td>1.408058e+00</td>\n",
" <td>2.500000e+02</td>\n",
" <td>1.120000e+05</td>\n",
" <td>7.200000e+02</td>\n",
" <td>NaN</td>\n",
" <td>1.000000e+00</td>\n",
" <td>2.000000e+00</td>\n",
" <td>1.500000e+01</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>4.649788e+01</td>\n",
" <td>2.441038e+00</td>\n",
" <td>5.000000e+02</td>\n",
" <td>1.690000e+05</td>\n",
" <td>1.200000e+03</td>\n",
" <td>NaN</td>\n",
" <td>2.000000e+00</td>\n",
" <td>4.000000e+00</td>\n",
" <td>4.300000e+01</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>4.883240e+01</td>\n",
" <td>5.383308e+00</td>\n",
" <td>7.500000e+02</td>\n",
" <td>2.590000e+05</td>\n",
" <td>2.000000e+03</td>\n",
" <td>NaN</td>\n",
" <td>3.000000e+00</td>\n",
" <td>5.000000e+00</td>\n",
" <td>1.030000e+02</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>5.108570e+01</td>\n",
" <td>5.572088e+01</td>\n",
" <td>1.000000e+03</td>\n",
" <td>7.500000e+07</td>\n",
" <td>1.920350e+09</td>\n",
" <td>NaN</td>\n",
" <td>2.019000e+03</td>\n",
" <td>5.652800e+06</td>\n",
" <td>7.610212e+11</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 22 columns</p>\n",
"</div>"
],
"text/plain": [
" ID LATITUDE LONGITUDE BLUR_RADIUS PRICE \\\n",
"count 0.0 2.346152e+06 1.222870e+06 1.221350e+06 2.346125e+06 \n",
"mean NaN 4.573629e+01 2.913308e+00 4.661755e+02 2.338131e+05 \n",
"std NaN 6.898158e+00 5.784470e+00 3.041044e+02 2.942130e+05 \n",
"min NaN -2.138532e+01 -6.177292e+01 5.000000e+01 1.000000e+00 \n",
"25% NaN 4.362788e+01 1.408058e+00 2.500000e+02 1.120000e+05 \n",
"50% NaN 4.649788e+01 2.441038e+00 5.000000e+02 1.690000e+05 \n",
"75% NaN 4.883240e+01 5.383308e+00 7.500000e+02 2.590000e+05 \n",
"max NaN 5.108570e+01 5.572088e+01 1.000000e+03 7.500000e+07 \n",
"\n",
" CONDOMINIUM_EXPENSES WATER_HEATING_MODE FLOOR FLOOR_COUNT \\\n",
"count 9.259670e+05 0.0 1.729398e+06 1.076405e+06 \n",
"mean 1.211348e+04 NaN 2.163156e+00 9.598156e+00 \n",
"std 4.006808e+06 NaN 7.338205e+00 5.448560e+03 \n",
"min 1.000000e-02 NaN -9.000000e+00 0.000000e+00 \n",
"25% 7.200000e+02 NaN 1.000000e+00 2.000000e+00 \n",
"50% 1.200000e+03 NaN 2.000000e+00 4.000000e+00 \n",
"75% 2.000000e+03 NaN 3.000000e+00 5.000000e+00 \n",
"max 1.920350e+09 NaN 2.019000e+03 5.652800e+06 \n",
"\n",
" LOT_COUNT ... TERRACE TERRACE_SURFACE SWIMMING_POOL GARDEN \\\n",
"count 1.014007e+06 ... 0.0 0.0 0.0 0.0 \n",
"mean 9.701759e+05 ... NaN NaN NaN NaN \n",
"std 7.661597e+08 ... NaN NaN NaN NaN \n",
"min 0.000000e+00 ... NaN NaN NaN NaN \n",
"25% 1.500000e+01 ... NaN NaN NaN NaN \n",
"50% 4.300000e+01 ... NaN NaN NaN NaN \n",
"75% 1.030000e+02 ... NaN NaN NaN NaN \n",
"max 7.610212e+11 ... NaN NaN NaN NaN \n",
"\n",
" STANDING SMALL_BUILDING CORNER_BUILDING PUBLICATION_START_DATE \\\n",
"count 0.0 0.0 0.0 0.0 \n",
"mean NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN \n",
"min NaN NaN NaN NaN \n",
"25% NaN NaN NaN NaN \n",
"50% NaN NaN NaN NaN \n",
"75% NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN \n",
"\n",
" DEALER_NAME DEALER_TYPE \n",
"count 0.0 0.0 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
"[8 rows x 22 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_path = 'annonces.csv'\n",
"raw_data = pd.read_csv(data_path,dtype={'DEPT_CODE': str, 'ZIP_CODE': str, 'INSEE_CODE': str})\n",
"raw_data.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The dataset contains 2346152 entries and 39 columns."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The goal is to predict `CONDOMINIUM_EXPENSES`, so this column is the target."
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>AD_URLS</th>\n",
" <th>PROPERTY_TYPE</th>\n",
" <th>DEPT_CODE</th>\n",
" <th>ZIP_CODE</th>\n",
" <th>CITY</th>\n",
" <th>INSEE_CODE</th>\n",
" <th>LATITUDE</th>\n",
" <th>LONGITUDE</th>\n",
" <th>BLUR_RADIUS</th>\n",
" <th>...</th>\n",
" <th>GARDEN</th>\n",
" <th>STANDING</th>\n",
" <th>NEW_BUILD</th>\n",
" <th>SMALL_BUILDING</th>\n",
" <th>CORNER_BUILDING</th>\n",
" <th>PUBLICATION_START_DATE</th>\n",
" <th>DEALER_NAME</th>\n",
" <th>DEALER_TYPE</th>\n",
" <th>REFERENCE_NUMBER</th>\n",
" <th>ENERGY_CLASSIFICATION</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/adapt-immo-850...</td>\n",
" <td>APARTMENT</td>\n",
" <td>01</td>\n",
" <td>01750</td>\n",
" <td>Saint-Laurent-sur-Saône</td>\n",
" <td>01370</td>\n",
" <td>46.310072</td>\n",
" <td>4.841813</td>\n",
" <td>1000.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8500236963</td>\n",
" <td>NS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/orpi-1-099934E...</td>\n",
" <td>APARTMENT</td>\n",
" <td>01</td>\n",
" <td>01750</td>\n",
" <td>Saint-Laurent-sur-Saône</td>\n",
" <td>01370</td>\n",
" <td>46.304441</td>\n",
" <td>4.841141</td>\n",
" <td>50.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>099934E0KVCC</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/fnaim-39964032</td>\n",
" <td>APARTMENT</td>\n",
" <td>01</td>\n",
" <td>01000</td>\n",
" <td>Bourg-en-Bresse</td>\n",
" <td>01053</td>\n",
" <td>46.205119</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>25871</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/ag752451-11061...</td>\n",
" <td>APARTMENT</td>\n",
" <td>01</td>\n",
" <td>01300</td>\n",
" <td>Belley</td>\n",
" <td>01034</td>\n",
" <td>45.742145</td>\n",
" <td>5.686873</td>\n",
" <td>1000.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>421601048</td>\n",
" <td>A</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/ag752451-11278...</td>\n",
" <td>APARTMENT</td>\n",
" <td>01</td>\n",
" <td>01140</td>\n",
" <td>Thoissey</td>\n",
" <td>01420</td>\n",
" <td>46.169511</td>\n",
" <td>4.784252</td>\n",
" <td>1000.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>421601005</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2346135</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/fnaim-37667773</td>\n",
" <td>APARTMENT</td>\n",
" <td>78</td>\n",
" <td>78100</td>\n",
" <td>Saint-Germain-en-Laye</td>\n",
" <td>78551</td>\n",
" <td>48.899041</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7000</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2346136</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/fnaim-38342524</td>\n",
" <td>APARTMENT</td>\n",
" <td>78</td>\n",
" <td>78100</td>\n",
" <td>Saint-Germain-en-Laye</td>\n",
" <td>78551</td>\n",
" <td>48.899041</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>7000</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2346138</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/pericles-15861344</td>\n",
" <td>APARTMENT</td>\n",
" <td>78</td>\n",
" <td>78100</td>\n",
" <td>Saint-Germain-en-Laye</td>\n",
" <td>78551</td>\n",
" <td>48.897947</td>\n",
" <td>2.100340</td>\n",
" <td>50.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>15861344</td>\n",
" <td>E</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2346140</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/immo-facile-22...</td>\n",
" <td>APARTMENT</td>\n",
" <td>78</td>\n",
" <td>78100</td>\n",
" <td>Saint-Germain-en-Laye</td>\n",
" <td>78551</td>\n",
" <td>48.895068</td>\n",
" <td>2.096111</td>\n",
" <td>250.0</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3984</td>\n",
" <td>D</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2346142</th>\n",
" <td>NaN</td>\n",
" <td>https://www.bienici.com/annonce/snpi-1106555</td>\n",
" <td>APARTMENT</td>\n",
" <td>78</td>\n",
" <td>78000</td>\n",
" <td>Versailles</td>\n",
" <td>78646</td>\n",
" <td>48.803540</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>AZ1-2014</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>919093 rows × 39 columns</p>\n",
"</div>"
],
"text/plain": [
" ID AD_URLS PROPERTY_TYPE \\\n",
"8 NaN https://www.bienici.com/annonce/adapt-immo-850... APARTMENT \n",
"9 NaN https://www.bienici.com/annonce/orpi-1-099934E... APARTMENT \n",
"15 NaN https://www.bienici.com/annonce/fnaim-39964032 APARTMENT \n",
"17 NaN https://www.bienici.com/annonce/ag752451-11061... APARTMENT \n",
"18 NaN https://www.bienici.com/annonce/ag752451-11278... APARTMENT \n",
"... .. ... ... \n",
"2346135 NaN https://www.bienici.com/annonce/fnaim-37667773 APARTMENT \n",
"2346136 NaN https://www.bienici.com/annonce/fnaim-38342524 APARTMENT \n",
"2346138 NaN https://www.bienici.com/annonce/pericles-15861344 APARTMENT \n",
"2346140 NaN https://www.bienici.com/annonce/immo-facile-22... APARTMENT \n",
"2346142 NaN https://www.bienici.com/annonce/snpi-1106555 APARTMENT \n",
"\n",
" DEPT_CODE ZIP_CODE CITY INSEE_CODE LATITUDE \\\n",
"8 01 01750 Saint-Laurent-sur-Saône 01370 46.310072 \n",
"9 01 01750 Saint-Laurent-sur-Saône 01370 46.304441 \n",
"15 01 01000 Bourg-en-Bresse 01053 46.205119 \n",
"17 01 01300 Belley 01034 45.742145 \n",
"18 01 01140 Thoissey 01420 46.169511 \n",
"... ... ... ... ... ... \n",
"2346135 78 78100 Saint-Germain-en-Laye 78551 48.899041 \n",
"2346136 78 78100 Saint-Germain-en-Laye 78551 48.899041 \n",
"2346138 78 78100 Saint-Germain-en-Laye 78551 48.897947 \n",
"2346140 78 78100 Saint-Germain-en-Laye 78551 48.895068 \n",
"2346142 78 78000 Versailles 78646 48.803540 \n",
"\n",
" LONGITUDE BLUR_RADIUS ... GARDEN STANDING NEW_BUILD \\\n",
"8 4.841813 1000.0 ... NaN NaN False \n",
"9 4.841141 50.0 ... NaN NaN False \n",
"15 NaN NaN ... NaN NaN False \n",
"17 5.686873 1000.0 ... NaN NaN False \n",
"18 4.784252 1000.0 ... NaN NaN False \n",
"... ... ... ... ... ... ... \n",
"2346135 NaN NaN ... NaN NaN False \n",
"2346136 NaN NaN ... NaN NaN False \n",
"2346138 2.100340 50.0 ... NaN NaN False \n",
"2346140 2.096111 250.0 ... NaN NaN False \n",
"2346142 NaN NaN ... NaN NaN False \n",
"\n",
" SMALL_BUILDING CORNER_BUILDING PUBLICATION_START_DATE DEALER_NAME \\\n",
"8 NaN NaN NaN NaN \n",
"9 NaN NaN NaN NaN \n",
"15 NaN NaN NaN NaN \n",
"17 NaN NaN NaN NaN \n",
"18 NaN NaN NaN NaN \n",
"... ... ... ... ... \n",
"2346135 NaN NaN NaN NaN \n",
"2346136 NaN NaN NaN NaN \n",
"2346138 NaN NaN NaN NaN \n",
"2346140 NaN NaN NaN NaN \n",
"2346142 NaN NaN NaN NaN \n",
"\n",
" DEALER_TYPE REFERENCE_NUMBER ENERGY_CLASSIFICATION \n",
"8 NaN 8500236963 NS \n",
"9 NaN 099934E0KVCC NaN \n",
"15 NaN 25871 NaN \n",
"17 NaN 421601048 A \n",
"18 NaN 421601005 C \n",
"... ... ... ... \n",
"2346135 NaN 7000 D \n",
"2346136 NaN 7000 D \n",
"2346138 NaN 15861344 E \n",
"2346140 NaN 3984 D \n",
"2346142 NaN AZ1-2014 NaN \n",
"\n",
"[919093 rows x 39 columns]"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop rows without CONDOMINIUM_EXPENSES, DEPT_CODE, ZIP_CODE, CITY\n",
"feature_cols = ['CONDOMINIUM_EXPENSES', 'DEPT_CODE', 'ZIP_CODE', 'CITY','SURFACE']\n",
"data = raw_data.dropna(subset=feature_cols)\n",
"data.describe()\n",
"data"
]
},
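{
"cell_type": "markdown",
"metadata": {},
"source": [
"925967 entries have a `CONDOMINIUM_EXPENSES` label; 919093 of them remain once rows missing `DEPT_CODE`, `ZIP_CODE`, `CITY` or `SURFACE` are also dropped."
]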
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Missing values per feature:\n",
"ZIP_CODE:\t0\n",
"DEPT_CODE:\t0\n",
"CITY:\t0\n",
"INSEE_CODE:\t0\n",
"LATITUDE:\t0\n",
"LONGITUDE:\t335521\n",
"SURFACE:\t0\n",
"HEATING_MODE:\t550030\n",
"ELEVATOR:\t545593\n"
]
}
],
"source": [
"NaNs = [f\"{col}:\\t{data[col].isnull().sum()}\" for col in data.columns]\n",
"print(\"Missing values per feature:\")\n",
"print(\"\\n\".join(NaNs))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training step: baseline"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"# Given missing values we will consider only these columns\n",
"#cols = ['PROPERTY_TYPE', 'DEPT_CODE', 'ZIP_CODE', 'CITY', 'INSEE_CODE', 'LATITUDE', 'LONGITUDE', \n",
"# 'MARKETING_TYPE', 'PRICE', 'SURFACE', 'HEATING_MODE', 'ELEVATOR', 'FLOOR', 'PARKING', \n",
"# 'PARKING_COUNT', 'NEW_BUILD', 'ENERGY_CLASSIFICATION'] \n",
"cols = ['ZIP_CODE','DEPT_CODE', 'CITY', 'INSEE_CODE', 'LATITUDE', 'LONGITUDE', 'SURFACE', 'HEATING_MODE', 'ELEVATOR'] "
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ZIP_CODE</th>\n",
" <th>DEPT_CODE</th>\n",
" <th>CITY</th>\n",
" <th>INSEE_CODE</th>\n",
" <th>LATITUDE</th>\n",
" <th>LONGITUDE</th>\n",
" <th>SURFACE</th>\n",
" <th>HEATING_MODE</th>\n",
" <th>ELEVATOR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>01750</td>\n",
" <td>01</td>\n",
" <td>Saint-Laurent-sur-Saône</td>\n",
" <td>01370</td>\n",
" <td>46.310072</td>\n",
" <td>4.841813</td>\n",
" <td>45.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>01750</td>\n",
" <td>01</td>\n",
" <td>Saint-Laurent-sur-Saône</td>\n",
" <td>01370</td>\n",
" <td>46.304441</td>\n",
" <td>4.841141</td>\n",
" <td>29.5</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>01000</td>\n",
" <td>01</td>\n",
" <td>Bourg-en-Bresse</td>\n",
" <td>01053</td>\n",
" <td>46.205119</td>\n",
" <td>NaN</td>\n",
" <td>19.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>01300</td>\n",
" <td>01</td>\n",
" <td>Belley</td>\n",
" <td>01034</td>\n",
" <td>45.742145</td>\n",
" <td>5.686873</td>\n",
" <td>59.0</td>\n",
" <td>NaN</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>01140</td>\n",
" <td>01</td>\n",
" <td>Thoissey</td>\n",
" <td>01420</td>\n",
" <td>46.169511</td>\n",
" <td>4.784252</td>\n",
" <td>68.0</td>\n",
" <td>COLLECTIVE</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ZIP_CODE DEPT_CODE CITY INSEE_CODE LATITUDE \\\n",
"8 01750 01 Saint-Laurent-sur-Saône 01370 46.310072 \n",
"9 01750 01 Saint-Laurent-sur-Saône 01370 46.304441 \n",
"15 01000 01 Bourg-en-Bresse 01053 46.205119 \n",
"17 01300 01 Belley 01034 45.742145 \n",
"18 01140 01 Thoissey 01420 46.169511 \n",
"\n",
" LONGITUDE SURFACE HEATING_MODE ELEVATOR \n",
"8 4.841813 45.0 NaN NaN \n",
"9 4.841141 29.5 NaN NaN \n",
"15 NaN 19.0 NaN NaN \n",
"17 5.686873 59.0 NaN False \n",
"18 4.784252 68.0 COLLECTIVE NaN "
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y = data['CONDOMINIUM_EXPENSES']\n",
"X = data[cols]\n",
"X['SURFACE'] = X['SURFACE'].astype(float)\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[nan 'COLLECTIVE' 'INDIVIDUAL']\n",
"[nan False True]\n",
"[ 45. 29.5 19. ... 208.82 195.8 213.43]\n",
"['01' '02' '03' '04' '06' '07' '08' '09' '93' '11' '12' '67' '13' '14'\n",
" '15' '17' '16' '18' '19' '2A' '21' '22' '23' '79' '24' '25' '26' '91'\n",
" '28' '27' '29' '30' '32' '33' '97' '2B' '31' '43' '52' '05' '70' '74'\n",
" '65' '87' '68' '92' '34' '35' '37' '36' '38' '39' '40' '44' '42' '41'\n",
" '45' '47' '46' '48' '49' '50' '51' '53' '54' '55' '56' '57' '58' '59'\n",
" '60' '61' '75' '62' '63' '64' '66' '69' '71' '72' '73' '77' '76' '80'\n",
" '82' '81' '90' '94' '95' '83' '84' '85' '88' '89' '78']\n"
]
}
],
"source": [
"print(X['HEATING_MODE'].unique())\n",
"print(X['ELEVATOR'].unique())\n",
"print(X['SURFACE'].unique())\n",
"print(X['DEPT_CODE'].unique())"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": true
},
"outputs": [
{
"ename": "IndexError",
"evalue": "single positional indexer is out-of-bounds",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-79-9b684df73459>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'SURFACE'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'SURFACE'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1766\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1767\u001b[0m \u001b[0mmaybe_callable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1768\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmaybe_callable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1769\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 2136\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[0;31m# validate the location\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2138\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2139\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2140\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_integer\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 2061\u001b[0m \u001b[0mlen_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2062\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mlen_axis\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mlen_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2063\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"single positional indexer is out-of-bounds\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2064\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2065\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_tuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtup\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIndexError\u001b[0m: single positional indexer is out-of-bounds"
]
}
],
"source": [
"from math import isnan\n",
"\n",
"for i, row in X.iterrows():\n",
" x = X['SURFACE'].iloc[i]\n",
" if not isinstance(x,str):\n",
" X.at[i,'SURFACE'] = 0\n",
" elif isinstance(x,str):\n",
" try:\n",
" X.at[i,'SURFACE'] = float(x)\n",
" except:\n",
" X.at[i,'SURFACE'] = 0\n",
" else:\n",
" X.at[i,'SURFACE'] = 0\n",
"\n",
"print(len(list(X.iterrows())))\n",
"X['SURFACE'].values\n",
"#X['DEPT_CODE'].apply(lambda x: str(x))\n",
"#x = X['DEPT_CODE'].iloc[1]\n",
"#isinstance(x,float)"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8 94.0\n",
"9 37.0\n",
"15 35.0\n",
"17 34.0\n",
"18 29.0\n",
" ... \n",
"2346135 244.0\n",
"2346136 244.0\n",
"2346138 190.0\n",
"2346140 210.0\n",
"2346142 171.0\n",
"Name: SURFACE, Length: 919093, dtype: float64"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"nan"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pos = 1 \n",
"if isinstance(X['DEPT_CODE'].iloc[pos], float):\n",
" X['DEPT_CODE'].iloc[pos] = str(int(X['DEPT_CODE'].iloc[pos]))\n",
"X['DEPT_CODE'].iloc[pos]\n",
"X['DEPT_CODE'].iloc[61540]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'ZIP_CODE': '21000', 'DEPT_CODE': '06', 'CITY': 'Toulouse', 'INSEE_CODE': '31555', 'LATITUDE': 43.6044621670154, 'LONGITUDE': 1.09373271, 'SURFACE': 65.0, 'HEATING_MODE': 'INDIVIDUAL', 'ELEVATOR': True}\n"
]
}
],
"source": [
"# Most frequent value of every column, computed on the training split only\n",
"train_mode = X_train.mode().iloc[0].to_dict()\n",
"X_train = X_train.fillna(train_mode)\n",
"# Fill the validation split with the same training statistics so both splits\n",
"# are preprocessed identically (and nothing is learned from validation data)\n",
"X_val = X_val.fillna(train_mode)\n",
"print(train_mode)"
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ZIP_CODE', 'DEPT_CODE', 'CITY', 'INSEE_CODE', 'HEATING_MODE']\n",
"['LATITUDE', 'LONGITUDE', 'SURFACE', 'ELEVATOR']\n"
]
}
],
"source": [
"# Partition the column names by dtype: object columns are treated as\n",
"# categorical, every other dtype as numeric.\n",
"cat_cols = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']\n",
"num_cols = [name for name, dtype in X_train.dtypes.items() if dtype != 'object']\n",
"print(cat_cols)\n",
"print(num_cols)"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 45. , 29.5, 19. , ..., 190. , 210. , 171. ])"
]
},
"execution_count": 165,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the SURFACE column as a raw NumPy array\n",
"X['SURFACE'].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.impute import SimpleImputer"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" after removing the cwd from sys.path.\n",
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py:966: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self.obj[item] = s\n"
]
}
],
"source": [
"# handle missing values on categorical columns\n",
"# NOTE(review): X_train was already mode-filled in an earlier cell, so this imputer\n",
"# presumably only changes X_val (constant placeholder) - confirm that is intended.\n",
"imputer = SimpleImputer(strategy='constant')\n",
"# Work on an explicit copy: X_val comes straight out of train_test_split, and\n",
"# assigning columns on it is what raised SettingWithCopyWarning.\n",
"X_val = X_val.copy()\n",
"X_train[cat_cols] = imputer.fit_transform(X_train[cat_cols])\n",
"X_val[cat_cols] = imputer.transform(X_val[cat_cols])"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ZIP_CODE\n",
"DEPT_CODE\n",
"CITY\n",
"INSEE_CODE\n",
"HEATING_MODE\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ZIP_CODE</th>\n",
" <th>DEPT_CODE</th>\n",
" <th>CITY</th>\n",
" <th>INSEE_CODE</th>\n",
" <th>LATITUDE</th>\n",
" <th>LONGITUDE</th>\n",
" <th>SURFACE</th>\n",
" <th>HEATING_MODE</th>\n",
" <th>ELEVATOR</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>117146</th>\n",
" <td>152</td>\n",
" <td>5</td>\n",
" <td>1749</td>\n",
" <td>332</td>\n",
" <td>43.564978</td>\n",
" <td>7.078969</td>\n",
" <td>64.0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>157490</th>\n",
" <td>167</td>\n",
" <td>5</td>\n",
" <td>813</td>\n",
" <td>271</td>\n",
" <td>43.551520</td>\n",
" <td>1.093733</td>\n",
" <td>80.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1139411</th>\n",
" <td>1188</td>\n",
" <td>34</td>\n",
" <td>3856</td>\n",
" <td>1778</td>\n",
" <td>48.111339</td>\n",
" <td>1.093733</td>\n",
" <td>76.0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1840659</th>\n",
" <td>2621</td>\n",
" <td>72</td>\n",
" <td>3267</td>\n",
" <td>3876</td>\n",
" <td>45.430798</td>\n",
" <td>1.093733</td>\n",
" <td>35.5</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1699429</th>\n",
" <td>2180</td>\n",
" <td>62</td>\n",
" <td>1123</td>\n",
" <td>3142</td>\n",
" <td>45.777455</td>\n",
" <td>1.093733</td>\n",
" <td>59.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2265223</th>\n",
" <td>3166</td>\n",
" <td>84</td>\n",
" <td>1060</td>\n",
" <td>4888</td>\n",
" <td>46.505000</td>\n",
" <td>1.093733</td>\n",
" <td>98.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>398123</th>\n",
" <td>536</td>\n",
" <td>16</td>\n",
" <td>646</td>\n",
" <td>732</td>\n",
" <td>47.082277</td>\n",
" <td>2.398566</td>\n",
" <td>63.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>313694</th>\n",
" <td>358</td>\n",
" <td>11</td>\n",
" <td>3611</td>\n",
" <td>520</td>\n",
" <td>43.353061</td>\n",
" <td>5.471240</td>\n",
" <td>61.0</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1157938</th>\n",
" <td>1173</td>\n",
" <td>34</td>\n",
" <td>3856</td>\n",
" <td>1778</td>\n",
" <td>48.111339</td>\n",
" <td>1.093733</td>\n",
" <td>103.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>761889</th>\n",
" <td>954</td>\n",
" <td>30</td>\n",
" <td>4933</td>\n",
" <td>1466</td>\n",
" <td>43.582785</td>\n",
" <td>1.093733</td>\n",
" <td>95.0</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>643365 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" ZIP_CODE DEPT_CODE CITY INSEE_CODE LATITUDE LONGITUDE SURFACE \\\n",
"117146 152 5 1749 332 43.564978 7.078969 64.0 \n",
"157490 167 5 813 271 43.551520 1.093733 80.0 \n",
"1139411 1188 34 3856 1778 48.111339 1.093733 76.0 \n",
"1840659 2621 72 3267 3876 45.430798 1.093733 35.5 \n",
"1699429 2180 62 1123 3142 45.777455 1.093733 59.0 \n",
"... ... ... ... ... ... ... ... \n",
"2265223 3166 84 1060 4888 46.505000 1.093733 98.0 \n",
"398123 536 16 646 732 47.082277 2.398566 63.0 \n",
"313694 358 11 3611 520 43.353061 5.471240 61.0 \n",
"1157938 1173 34 3856 1778 48.111339 1.093733 103.0 \n",
"761889 954 30 4933 1466 43.582785 1.093733 95.0 \n",
"\n",
" HEATING_MODE ELEVATOR \n",
"117146 0 True \n",
"157490 1 True \n",
"1139411 0 True \n",
"1840659 1 True \n",
"1699429 1 True \n",
"... ... ... \n",
"2265223 1 True \n",
"398123 1 True \n",
"313694 0 True \n",
"1157938 1 True \n",
"761889 1 True \n",
"\n",
"[643365 rows x 9 columns]"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"\n",
"# Integer-encode every categorical column.\n",
"# One encoder is fitted per column on the training split and kept in\n",
"# `label_encoders`, so the exact same mapping can be applied to validation data.\n",
"# Without encoding X_val the model later fails with\n",
"# \"could not convert string to float\" at prediction time.\n",
"label_encoders = {}\n",
"X_val = X_val.copy()  # independent frame: avoids SettingWithCopyWarning on assignment\n",
"\n",
"for col in cat_cols:\n",
"    labEnc = LabelEncoder().fit(X_train[col])\n",
"    label_encoders[col] = labEnc\n",
"    # classes_ is ordered by code, so its position is the encoded value\n",
"    code_of = {label: code for code, label in enumerate(labEnc.classes_)}\n",
"    X_train[col] = X_train[col].map(code_of)\n",
"    # LabelEncoder.transform raises on labels unseen during fit; map them to -1 instead\n",
"    X_val[col] = X_val[col].map(code_of).fillna(-1).astype(int)\n",
"\n",
"X_train"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" after removing the cwd from sys.path.\n",
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py:966: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self.obj[item] = s\n"
]
}
],
"source": [
"# Fill missing numeric values with the column mean learned on the training split\n",
"num_imputer = SimpleImputer(strategy='mean')\n",
"\n",
"# Work on an explicit copy: assigning columns on the frame returned by\n",
"# train_test_split is what raised SettingWithCopyWarning.\n",
"X_val = X_val.copy()\n",
"X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])\n",
"X_val[num_cols] = num_imputer.transform(X_val[num_cols])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# from sklearn.metrics import mean_absolute_error  (used to evaluate the error)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [],
"source": [
"# Build the linear model, then train it on the training split\n",
"lin_reg = LinearRegression()\n",
"lin_reg = lin_reg.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_absolute_error"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "could not convert string to float: 'carcassonne'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-175-b23f0d3c285a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpreds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlin_reg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0mpredicted\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 235\u001b[0m \"\"\"\n\u001b[0;32m--> 236\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decision_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0m_preprocess_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstaticmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_preprocess_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36m_decision_function\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 218\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'coo'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 219\u001b[0m return safe_sparse_dot(X, self.coef_.T,\n\u001b[1;32m 220\u001b[0m dense_output=True) + self.intercept_\n",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 71\u001b[0m FutureWarning)\n\u001b[1;32m 72\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcasting\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"unsafe\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 600\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m raise ValueError(\"Complex data not supported\\n\"\n",
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/numpy/core/_asarray.py\u001b[0m in \u001b[0;36masarray\u001b[0;34m(a, dtype, order)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \"\"\"\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'carcassonne'"
]
}
],
"source": [
"# Predict on the validation features with the fitted linear model\n",
"preds = lin_reg.predict(X=X_val)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.7 64-bit ('.env': venv)",
"language": "python",
"name": "python37764bitenvvenv28a5e1c45c144c06bbb2a58e0b29bacb"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}