mirror of
https://github.com/rjNemo/copro-api
synced 2026-06-06 02:16:44 +00:00
1568 lines
64 KiB
Text
1568 lines
64 KiB
Text
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Data Modelling"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Data Exploration"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (13) have mixed types.Specify dtype option on import or set low_memory=False.\n",
|
||
" interactivity=interactivity, compiler=compiler, result=result)\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ID</th>\n",
|
||
" <th>LATITUDE</th>\n",
|
||
" <th>LONGITUDE</th>\n",
|
||
" <th>BLUR_RADIUS</th>\n",
|
||
" <th>PRICE</th>\n",
|
||
" <th>CONDOMINIUM_EXPENSES</th>\n",
|
||
" <th>WATER_HEATING_MODE</th>\n",
|
||
" <th>FLOOR</th>\n",
|
||
" <th>FLOOR_COUNT</th>\n",
|
||
" <th>LOT_COUNT</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>TERRACE</th>\n",
|
||
" <th>TERRACE_SURFACE</th>\n",
|
||
" <th>SWIMMING_POOL</th>\n",
|
||
" <th>GARDEN</th>\n",
|
||
" <th>STANDING</th>\n",
|
||
" <th>SMALL_BUILDING</th>\n",
|
||
" <th>CORNER_BUILDING</th>\n",
|
||
" <th>PUBLICATION_START_DATE</th>\n",
|
||
" <th>DEALER_NAME</th>\n",
|
||
" <th>DEALER_TYPE</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>count</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>2.346152e+06</td>\n",
|
||
" <td>1.222870e+06</td>\n",
|
||
" <td>1.221350e+06</td>\n",
|
||
" <td>2.346125e+06</td>\n",
|
||
" <td>9.259670e+05</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>1.729398e+06</td>\n",
|
||
" <td>1.076405e+06</td>\n",
|
||
" <td>1.014007e+06</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>mean</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4.573629e+01</td>\n",
|
||
" <td>2.913308e+00</td>\n",
|
||
" <td>4.661755e+02</td>\n",
|
||
" <td>2.338131e+05</td>\n",
|
||
" <td>1.211348e+04</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2.163156e+00</td>\n",
|
||
" <td>9.598156e+00</td>\n",
|
||
" <td>9.701759e+05</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>std</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6.898158e+00</td>\n",
|
||
" <td>5.784470e+00</td>\n",
|
||
" <td>3.041044e+02</td>\n",
|
||
" <td>2.942130e+05</td>\n",
|
||
" <td>4.006808e+06</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>7.338205e+00</td>\n",
|
||
" <td>5.448560e+03</td>\n",
|
||
" <td>7.661597e+08</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>min</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-2.138532e+01</td>\n",
|
||
" <td>-6.177292e+01</td>\n",
|
||
" <td>5.000000e+01</td>\n",
|
||
" <td>1.000000e+00</td>\n",
|
||
" <td>1.000000e-02</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>-9.000000e+00</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25%</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4.362788e+01</td>\n",
|
||
" <td>1.408058e+00</td>\n",
|
||
" <td>2.500000e+02</td>\n",
|
||
" <td>1.120000e+05</td>\n",
|
||
" <td>7.200000e+02</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>1.000000e+00</td>\n",
|
||
" <td>2.000000e+00</td>\n",
|
||
" <td>1.500000e+01</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>50%</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4.649788e+01</td>\n",
|
||
" <td>2.441038e+00</td>\n",
|
||
" <td>5.000000e+02</td>\n",
|
||
" <td>1.690000e+05</td>\n",
|
||
" <td>1.200000e+03</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2.000000e+00</td>\n",
|
||
" <td>4.000000e+00</td>\n",
|
||
" <td>4.300000e+01</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>75%</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4.883240e+01</td>\n",
|
||
" <td>5.383308e+00</td>\n",
|
||
" <td>7.500000e+02</td>\n",
|
||
" <td>2.590000e+05</td>\n",
|
||
" <td>2.000000e+03</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>3.000000e+00</td>\n",
|
||
" <td>5.000000e+00</td>\n",
|
||
" <td>1.030000e+02</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>max</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>5.108570e+01</td>\n",
|
||
" <td>5.572088e+01</td>\n",
|
||
" <td>1.000000e+03</td>\n",
|
||
" <td>7.500000e+07</td>\n",
|
||
" <td>1.920350e+09</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>2.019000e+03</td>\n",
|
||
" <td>5.652800e+06</td>\n",
|
||
" <td>7.610212e+11</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>8 rows × 22 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ID LATITUDE LONGITUDE BLUR_RADIUS PRICE \\\n",
|
||
"count 0.0 2.346152e+06 1.222870e+06 1.221350e+06 2.346125e+06 \n",
|
||
"mean NaN 4.573629e+01 2.913308e+00 4.661755e+02 2.338131e+05 \n",
|
||
"std NaN 6.898158e+00 5.784470e+00 3.041044e+02 2.942130e+05 \n",
|
||
"min NaN -2.138532e+01 -6.177292e+01 5.000000e+01 1.000000e+00 \n",
|
||
"25% NaN 4.362788e+01 1.408058e+00 2.500000e+02 1.120000e+05 \n",
|
||
"50% NaN 4.649788e+01 2.441038e+00 5.000000e+02 1.690000e+05 \n",
|
||
"75% NaN 4.883240e+01 5.383308e+00 7.500000e+02 2.590000e+05 \n",
|
||
"max NaN 5.108570e+01 5.572088e+01 1.000000e+03 7.500000e+07 \n",
|
||
"\n",
|
||
" CONDOMINIUM_EXPENSES WATER_HEATING_MODE FLOOR FLOOR_COUNT \\\n",
|
||
"count 9.259670e+05 0.0 1.729398e+06 1.076405e+06 \n",
|
||
"mean 1.211348e+04 NaN 2.163156e+00 9.598156e+00 \n",
|
||
"std 4.006808e+06 NaN 7.338205e+00 5.448560e+03 \n",
|
||
"min 1.000000e-02 NaN -9.000000e+00 0.000000e+00 \n",
|
||
"25% 7.200000e+02 NaN 1.000000e+00 2.000000e+00 \n",
|
||
"50% 1.200000e+03 NaN 2.000000e+00 4.000000e+00 \n",
|
||
"75% 2.000000e+03 NaN 3.000000e+00 5.000000e+00 \n",
|
||
"max 1.920350e+09 NaN 2.019000e+03 5.652800e+06 \n",
|
||
"\n",
|
||
" LOT_COUNT ... TERRACE TERRACE_SURFACE SWIMMING_POOL GARDEN \\\n",
|
||
"count 1.014007e+06 ... 0.0 0.0 0.0 0.0 \n",
|
||
"mean 9.701759e+05 ... NaN NaN NaN NaN \n",
|
||
"std 7.661597e+08 ... NaN NaN NaN NaN \n",
|
||
"min 0.000000e+00 ... NaN NaN NaN NaN \n",
|
||
"25% 1.500000e+01 ... NaN NaN NaN NaN \n",
|
||
"50% 4.300000e+01 ... NaN NaN NaN NaN \n",
|
||
"75% 1.030000e+02 ... NaN NaN NaN NaN \n",
|
||
"max 7.610212e+11 ... NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" STANDING SMALL_BUILDING CORNER_BUILDING PUBLICATION_START_DATE \\\n",
|
||
"count 0.0 0.0 0.0 0.0 \n",
|
||
"mean NaN NaN NaN NaN \n",
|
||
"std NaN NaN NaN NaN \n",
|
||
"min NaN NaN NaN NaN \n",
|
||
"25% NaN NaN NaN NaN \n",
|
||
"50% NaN NaN NaN NaN \n",
|
||
"75% NaN NaN NaN NaN \n",
|
||
"max NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" DEALER_NAME DEALER_TYPE \n",
|
||
"count 0.0 0.0 \n",
|
||
"mean NaN NaN \n",
|
||
"std NaN NaN \n",
|
||
"min NaN NaN \n",
|
||
"25% NaN NaN \n",
|
||
"50% NaN NaN \n",
|
||
"75% NaN NaN \n",
|
||
"max NaN NaN \n",
|
||
"\n",
|
||
"[8 rows x 22 columns]"
|
||
]
|
||
},
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data_path = 'annonces.csv'\n",
|
||
"raw_data = pd.read_csv(data_path,dtype={'DEPT_CODE': str, 'ZIP_CODE': str, 'INSEE_CODE': str})\n",
|
||
"raw_data.describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Le dataset contient 2346152 entrées et 39 colonnes."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"The aim is to predict CONDOMINIUM_EXPENSES so this is the target."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 100,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ID</th>\n",
|
||
" <th>AD_URLS</th>\n",
|
||
" <th>PROPERTY_TYPE</th>\n",
|
||
" <th>DEPT_CODE</th>\n",
|
||
" <th>ZIP_CODE</th>\n",
|
||
" <th>CITY</th>\n",
|
||
" <th>INSEE_CODE</th>\n",
|
||
" <th>LATITUDE</th>\n",
|
||
" <th>LONGITUDE</th>\n",
|
||
" <th>BLUR_RADIUS</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>GARDEN</th>\n",
|
||
" <th>STANDING</th>\n",
|
||
" <th>NEW_BUILD</th>\n",
|
||
" <th>SMALL_BUILDING</th>\n",
|
||
" <th>CORNER_BUILDING</th>\n",
|
||
" <th>PUBLICATION_START_DATE</th>\n",
|
||
" <th>DEALER_NAME</th>\n",
|
||
" <th>DEALER_TYPE</th>\n",
|
||
" <th>REFERENCE_NUMBER</th>\n",
|
||
" <th>ENERGY_CLASSIFICATION</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/adapt-immo-850...</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>01750</td>\n",
|
||
" <td>Saint-Laurent-sur-Saône</td>\n",
|
||
" <td>01370</td>\n",
|
||
" <td>46.310072</td>\n",
|
||
" <td>4.841813</td>\n",
|
||
" <td>1000.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>8500236963</td>\n",
|
||
" <td>NS</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/orpi-1-099934E...</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>01750</td>\n",
|
||
" <td>Saint-Laurent-sur-Saône</td>\n",
|
||
" <td>01370</td>\n",
|
||
" <td>46.304441</td>\n",
|
||
" <td>4.841141</td>\n",
|
||
" <td>50.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>099934E0KVCC</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/fnaim-39964032</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>01000</td>\n",
|
||
" <td>Bourg-en-Bresse</td>\n",
|
||
" <td>01053</td>\n",
|
||
" <td>46.205119</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>25871</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/ag752451-11061...</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>01300</td>\n",
|
||
" <td>Belley</td>\n",
|
||
" <td>01034</td>\n",
|
||
" <td>45.742145</td>\n",
|
||
" <td>5.686873</td>\n",
|
||
" <td>1000.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>421601048</td>\n",
|
||
" <td>A</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/ag752451-11278...</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>01140</td>\n",
|
||
" <td>Thoissey</td>\n",
|
||
" <td>01420</td>\n",
|
||
" <td>46.169511</td>\n",
|
||
" <td>4.784252</td>\n",
|
||
" <td>1000.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>421601005</td>\n",
|
||
" <td>C</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2346135</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/fnaim-37667773</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>78</td>\n",
|
||
" <td>78100</td>\n",
|
||
" <td>Saint-Germain-en-Laye</td>\n",
|
||
" <td>78551</td>\n",
|
||
" <td>48.899041</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>7000</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2346136</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/fnaim-38342524</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>78</td>\n",
|
||
" <td>78100</td>\n",
|
||
" <td>Saint-Germain-en-Laye</td>\n",
|
||
" <td>78551</td>\n",
|
||
" <td>48.899041</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>7000</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2346138</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/pericles-15861344</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>78</td>\n",
|
||
" <td>78100</td>\n",
|
||
" <td>Saint-Germain-en-Laye</td>\n",
|
||
" <td>78551</td>\n",
|
||
" <td>48.897947</td>\n",
|
||
" <td>2.100340</td>\n",
|
||
" <td>50.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>15861344</td>\n",
|
||
" <td>E</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2346140</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/immo-facile-22...</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>78</td>\n",
|
||
" <td>78100</td>\n",
|
||
" <td>Saint-Germain-en-Laye</td>\n",
|
||
" <td>78551</td>\n",
|
||
" <td>48.895068</td>\n",
|
||
" <td>2.096111</td>\n",
|
||
" <td>250.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>3984</td>\n",
|
||
" <td>D</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2346142</th>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>https://www.bienici.com/annonce/snpi-1106555</td>\n",
|
||
" <td>APARTMENT</td>\n",
|
||
" <td>78</td>\n",
|
||
" <td>78000</td>\n",
|
||
" <td>Versailles</td>\n",
|
||
" <td>78646</td>\n",
|
||
" <td>48.803540</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>AZ1-2014</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>919093 rows × 39 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ID AD_URLS PROPERTY_TYPE \\\n",
|
||
"8 NaN https://www.bienici.com/annonce/adapt-immo-850... APARTMENT \n",
|
||
"9 NaN https://www.bienici.com/annonce/orpi-1-099934E... APARTMENT \n",
|
||
"15 NaN https://www.bienici.com/annonce/fnaim-39964032 APARTMENT \n",
|
||
"17 NaN https://www.bienici.com/annonce/ag752451-11061... APARTMENT \n",
|
||
"18 NaN https://www.bienici.com/annonce/ag752451-11278... APARTMENT \n",
|
||
"... .. ... ... \n",
|
||
"2346135 NaN https://www.bienici.com/annonce/fnaim-37667773 APARTMENT \n",
|
||
"2346136 NaN https://www.bienici.com/annonce/fnaim-38342524 APARTMENT \n",
|
||
"2346138 NaN https://www.bienici.com/annonce/pericles-15861344 APARTMENT \n",
|
||
"2346140 NaN https://www.bienici.com/annonce/immo-facile-22... APARTMENT \n",
|
||
"2346142 NaN https://www.bienici.com/annonce/snpi-1106555 APARTMENT \n",
|
||
"\n",
|
||
" DEPT_CODE ZIP_CODE CITY INSEE_CODE LATITUDE \\\n",
|
||
"8 01 01750 Saint-Laurent-sur-Saône 01370 46.310072 \n",
|
||
"9 01 01750 Saint-Laurent-sur-Saône 01370 46.304441 \n",
|
||
"15 01 01000 Bourg-en-Bresse 01053 46.205119 \n",
|
||
"17 01 01300 Belley 01034 45.742145 \n",
|
||
"18 01 01140 Thoissey 01420 46.169511 \n",
|
||
"... ... ... ... ... ... \n",
|
||
"2346135 78 78100 Saint-Germain-en-Laye 78551 48.899041 \n",
|
||
"2346136 78 78100 Saint-Germain-en-Laye 78551 48.899041 \n",
|
||
"2346138 78 78100 Saint-Germain-en-Laye 78551 48.897947 \n",
|
||
"2346140 78 78100 Saint-Germain-en-Laye 78551 48.895068 \n",
|
||
"2346142 78 78000 Versailles 78646 48.803540 \n",
|
||
"\n",
|
||
" LONGITUDE BLUR_RADIUS ... GARDEN STANDING NEW_BUILD \\\n",
|
||
"8 4.841813 1000.0 ... NaN NaN False \n",
|
||
"9 4.841141 50.0 ... NaN NaN False \n",
|
||
"15 NaN NaN ... NaN NaN False \n",
|
||
"17 5.686873 1000.0 ... NaN NaN False \n",
|
||
"18 4.784252 1000.0 ... NaN NaN False \n",
|
||
"... ... ... ... ... ... ... \n",
|
||
"2346135 NaN NaN ... NaN NaN False \n",
|
||
"2346136 NaN NaN ... NaN NaN False \n",
|
||
"2346138 2.100340 50.0 ... NaN NaN False \n",
|
||
"2346140 2.096111 250.0 ... NaN NaN False \n",
|
||
"2346142 NaN NaN ... NaN NaN False \n",
|
||
"\n",
|
||
" SMALL_BUILDING CORNER_BUILDING PUBLICATION_START_DATE DEALER_NAME \\\n",
|
||
"8 NaN NaN NaN NaN \n",
|
||
"9 NaN NaN NaN NaN \n",
|
||
"15 NaN NaN NaN NaN \n",
|
||
"17 NaN NaN NaN NaN \n",
|
||
"18 NaN NaN NaN NaN \n",
|
||
"... ... ... ... ... \n",
|
||
"2346135 NaN NaN NaN NaN \n",
|
||
"2346136 NaN NaN NaN NaN \n",
|
||
"2346138 NaN NaN NaN NaN \n",
|
||
"2346140 NaN NaN NaN NaN \n",
|
||
"2346142 NaN NaN NaN NaN \n",
|
||
"\n",
|
||
" DEALER_TYPE REFERENCE_NUMBER ENERGY_CLASSIFICATION \n",
|
||
"8 NaN 8500236963 NS \n",
|
||
"9 NaN 099934E0KVCC NaN \n",
|
||
"15 NaN 25871 NaN \n",
|
||
"17 NaN 421601048 A \n",
|
||
"18 NaN 421601005 C \n",
|
||
"... ... ... ... \n",
|
||
"2346135 NaN 7000 D \n",
|
||
"2346136 NaN 7000 D \n",
|
||
"2346138 NaN 15861344 E \n",
|
||
"2346140 NaN 3984 D \n",
|
||
"2346142 NaN AZ1-2014 NaN \n",
|
||
"\n",
|
||
"[919093 rows x 39 columns]"
|
||
]
|
||
},
|
||
"execution_count": 100,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# drop rows without CONDOMINIUM_EXPENSES, DEPT_CODE, ZIP_CODE, CITY\n",
|
||
"feature_cols = ['CONDOMINIUM_EXPENSES', 'DEPT_CODE', 'ZIP_CODE', 'CITY','SURFACE']\n",
|
||
"data = raw_data.dropna(subset=feature_cols)\n",
|
||
"data.describe()\n",
|
||
"data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"925967 labelled entries"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 155,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Missing values per feature:\n",
|
||
"ZIP_CODE:\t0\n",
|
||
"DEPT_CODE:\t0\n",
|
||
"CITY:\t0\n",
|
||
"INSEE_CODE:\t0\n",
|
||
"LATITUDE:\t0\n",
|
||
"LONGITUDE:\t335521\n",
|
||
"SURFACE:\t0\n",
|
||
"HEATING_MODE:\t550030\n",
|
||
"ELEVATOR:\t545593\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"NaNs = [f\"{col}:\\t{data[col].isnull().sum()}\" for col in data.columns]\n",
|
||
"print(\"Missing values per feature:\")\n",
|
||
"print(\"\\n\".join(NaNs))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Training step: baseline"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 103,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Given missing values we will consider only these columns\n",
|
||
"#cols = ['PROPERTY_TYPE', 'DEPT_CODE', 'ZIP_CODE', 'CITY', 'INSEE_CODE', 'LATITUDE', 'LONGITUDE', \n",
|
||
"# 'MARKETING_TYPE', 'PRICE', 'SURFACE', 'HEATING_MODE', 'ELEVATOR', 'FLOOR', 'PARKING', \n",
|
||
"# 'PARKING_COUNT', 'NEW_BUILD', 'ENERGY_CLASSIFICATION'] \n",
|
||
"cols = ['ZIP_CODE','DEPT_CODE', 'CITY', 'INSEE_CODE', 'LATITUDE', 'LONGITUDE', 'SURFACE', 'HEATING_MODE', 'ELEVATOR'] "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 104,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" This is separate from the ipykernel package so we can avoid doing imports until\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ZIP_CODE</th>\n",
|
||
" <th>DEPT_CODE</th>\n",
|
||
" <th>CITY</th>\n",
|
||
" <th>INSEE_CODE</th>\n",
|
||
" <th>LATITUDE</th>\n",
|
||
" <th>LONGITUDE</th>\n",
|
||
" <th>SURFACE</th>\n",
|
||
" <th>HEATING_MODE</th>\n",
|
||
" <th>ELEVATOR</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>01750</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>Saint-Laurent-sur-Saône</td>\n",
|
||
" <td>01370</td>\n",
|
||
" <td>46.310072</td>\n",
|
||
" <td>4.841813</td>\n",
|
||
" <td>45.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>01750</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>Saint-Laurent-sur-Saône</td>\n",
|
||
" <td>01370</td>\n",
|
||
" <td>46.304441</td>\n",
|
||
" <td>4.841141</td>\n",
|
||
" <td>29.5</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>01000</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>Bourg-en-Bresse</td>\n",
|
||
" <td>01053</td>\n",
|
||
" <td>46.205119</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>19.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>01300</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>Belley</td>\n",
|
||
" <td>01034</td>\n",
|
||
" <td>45.742145</td>\n",
|
||
" <td>5.686873</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>False</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>01140</td>\n",
|
||
" <td>01</td>\n",
|
||
" <td>Thoissey</td>\n",
|
||
" <td>01420</td>\n",
|
||
" <td>46.169511</td>\n",
|
||
" <td>4.784252</td>\n",
|
||
" <td>68.0</td>\n",
|
||
" <td>COLLECTIVE</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ZIP_CODE DEPT_CODE CITY INSEE_CODE LATITUDE \\\n",
|
||
"8 01750 01 Saint-Laurent-sur-Saône 01370 46.310072 \n",
|
||
"9 01750 01 Saint-Laurent-sur-Saône 01370 46.304441 \n",
|
||
"15 01000 01 Bourg-en-Bresse 01053 46.205119 \n",
|
||
"17 01300 01 Belley 01034 45.742145 \n",
|
||
"18 01140 01 Thoissey 01420 46.169511 \n",
|
||
"\n",
|
||
" LONGITUDE SURFACE HEATING_MODE ELEVATOR \n",
|
||
"8 4.841813 45.0 NaN NaN \n",
|
||
"9 4.841141 29.5 NaN NaN \n",
|
||
"15 NaN 19.0 NaN NaN \n",
|
||
"17 5.686873 59.0 NaN False \n",
|
||
"18 4.784252 68.0 COLLECTIVE NaN "
|
||
]
|
||
},
|
||
"execution_count": 104,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"y = data['CONDOMINIUM_EXPENSES']\n",
|
||
"X = data[cols]\n",
|
||
"X['SURFACE'] = X['SURFACE'].astype(float)\n",
|
||
"X.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 105,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[nan 'COLLECTIVE' 'INDIVIDUAL']\n",
|
||
"[nan False True]\n",
|
||
"[ 45. 29.5 19. ... 208.82 195.8 213.43]\n",
|
||
"['01' '02' '03' '04' '06' '07' '08' '09' '93' '11' '12' '67' '13' '14'\n",
|
||
" '15' '17' '16' '18' '19' '2A' '21' '22' '23' '79' '24' '25' '26' '91'\n",
|
||
" '28' '27' '29' '30' '32' '33' '97' '2B' '31' '43' '52' '05' '70' '74'\n",
|
||
" '65' '87' '68' '92' '34' '35' '37' '36' '38' '39' '40' '44' '42' '41'\n",
|
||
" '45' '47' '46' '48' '49' '50' '51' '53' '54' '55' '56' '57' '58' '59'\n",
|
||
" '60' '61' '75' '62' '63' '64' '66' '69' '71' '72' '73' '77' '76' '80'\n",
|
||
" '82' '81' '90' '94' '95' '83' '84' '85' '88' '89' '78']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"print(X['HEATING_MODE'].unique())\n",
|
||
"print(X['ELEVATOR'].unique())\n",
|
||
"print(X['SURFACE'].unique())\n",
|
||
"print(X['DEPT_CODE'].unique())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 79,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"ename": "IndexError",
|
||
"evalue": "single positional indexer is out-of-bounds",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
|
||
"\u001b[0;32m<ipython-input-79-9b684df73459>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'SURFACE'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mat\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'SURFACE'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 1766\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1767\u001b[0m \u001b[0mmaybe_callable\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1768\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmaybe_callable\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1769\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1770\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 2136\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[0;31m# validate the location\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2138\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2139\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2140\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_validate_integer\u001b[0;34m(self, key, axis)\u001b[0m\n\u001b[1;32m 2061\u001b[0m \u001b[0mlen_axis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_axis\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2062\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m>=\u001b[0m \u001b[0mlen_axis\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mkey\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mlen_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2063\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"single positional indexer is out-of-bounds\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2064\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2065\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_tuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtup\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTuple\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;31mIndexError\u001b[0m: single positional indexer is out-of-bounds"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from math import isnan\n",
|
||
"\n",
|
||
"# Convert SURFACE to float in a single vectorised pass.\n",
|
||
"# The earlier row loop passed the index label yielded by iterrows() to .iloc,\n",
|
||
"# which is positional, and raised IndexError (X's index is not 0..n-1).\n",
|
||
"# It also overwrote every value that was already numeric with 0.\n",
|
||
"# Entries that are missing or cannot be parsed become 0.\n",
|
||
"X['SURFACE'] = pd.to_numeric(X['SURFACE'], errors='coerce').fillna(0).astype(float)\n",
|
||
"\n",
|
||
"print(len(X))\n",
|
||
"X['SURFACE'].values"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 83,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"8 94.0\n",
|
||
"9 37.0\n",
|
||
"15 35.0\n",
|
||
"17 34.0\n",
|
||
"18 29.0\n",
|
||
" ... \n",
|
||
"2346135 244.0\n",
|
||
"2346136 244.0\n",
|
||
"2346138 190.0\n",
|
||
"2346140 210.0\n",
|
||
"2346142 171.0\n",
|
||
"Name: SURFACE, Length: 919093, dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 83,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"nan"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"pos = 1\n",
|
||
"dept_col = X.columns.get_loc('DEPT_CODE')\n",
|
||
"value = X.iloc[pos, dept_col]\n",
|
||
"# Skip NaN: it is a float too, and int(nan) raises ValueError.\n",
|
||
"if isinstance(value, float) and pd.notna(value):\n",
|
||
"    # Assign through X.iloc in one step; the chained form\n",
|
||
"    # X['DEPT_CODE'].iloc[pos] = ... can write to a temporary copy.\n",
|
||
"    # zfill(2) restores the leading zero lost in the float (6.0 -> '06').\n",
|
||
"    # NOTE(review): assumes codes are at least two characters - confirm.\n",
|
||
"    X.iloc[pos, dept_col] = str(int(value)).zfill(2)\n",
|
||
"# Show both inspected positions (only the last bare expression is displayed).\n",
|
||
"X['DEPT_CODE'].iloc[[pos, 61540]]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 91,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.model_selection import train_test_split"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 162,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=0)\n",
|
||
"# train_test_split returns row selections of X that pandas tracks as copies;\n",
|
||
"# take explicit copies so later column assignments (e.g. X_val[cat_cols] = ...)\n",
|
||
"# do not emit SettingWithCopyWarning.\n",
|
||
"X_train, X_val = X_train.copy(), X_val.copy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 163,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"{'ZIP_CODE': '21000', 'DEPT_CODE': '06', 'CITY': 'Toulouse', 'INSEE_CODE': '31555', 'LATITUDE': 43.6044621670154, 'LONGITUDE': 1.09373271, 'SURFACE': 65.0, 'HEATING_MODE': 'INDIVIDUAL', 'ELEVATOR': True}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Most frequent value of each column, computed on the training split only.\n",
|
||
"train_mode = dict(X_train.mode().iloc[0])\n",
|
||
"X_train = X_train.fillna(train_mode)\n",
|
||
"# Fill the validation split with the same training-derived values so both\n",
|
||
"# splits go through identical preprocessing.\n",
|
||
"X_val = X_val.fillna(train_mode)\n",
|
||
"print(train_mode)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 164,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"['ZIP_CODE', 'DEPT_CODE', 'CITY', 'INSEE_CODE', 'HEATING_MODE']\n",
|
||
"['LATITUDE', 'LONGITUDE', 'SURFACE', 'ELEVATOR']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Partition the feature names by dtype: object columns vs. everything else.\n",
|
||
"is_object = X_train.dtypes == 'object'\n",
|
||
"cat_cols = is_object[is_object].index.tolist()\n",
|
||
"num_cols = is_object[~is_object].index.tolist()\n",
|
||
"print(cat_cols)\n",
|
||
"print(num_cols)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 165,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([ 45. , 29.5, 19. , ..., 190. , 210. , 171. ])"
|
||
]
|
||
},
|
||
"execution_count": 165,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Peek at the SURFACE column as a raw NumPy array.\n",
|
||
"X.loc[:, 'SURFACE'].to_numpy()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 166,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.impute import SimpleImputer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 167,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" after removing the cwd from sys.path.\n",
|
||
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py:966: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" self.obj[item] = s\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Categorical gaps: fit a constant-fill imputer on the training split,\n",
|
||
"# then apply that same fitted imputer to both splits.\n",
|
||
"imputer = SimpleImputer(strategy='constant').fit(X_train[cat_cols])\n",
|
||
"for frame in (X_train, X_val):\n",
|
||
"    frame[cat_cols] = imputer.transform(frame[cat_cols])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 168,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"ZIP_CODE\n",
|
||
"DEPT_CODE\n",
|
||
"CITY\n",
|
||
"INSEE_CODE\n",
|
||
"HEATING_MODE\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ZIP_CODE</th>\n",
|
||
" <th>DEPT_CODE</th>\n",
|
||
" <th>CITY</th>\n",
|
||
" <th>INSEE_CODE</th>\n",
|
||
" <th>LATITUDE</th>\n",
|
||
" <th>LONGITUDE</th>\n",
|
||
" <th>SURFACE</th>\n",
|
||
" <th>HEATING_MODE</th>\n",
|
||
" <th>ELEVATOR</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>117146</th>\n",
|
||
" <td>152</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1749</td>\n",
|
||
" <td>332</td>\n",
|
||
" <td>43.564978</td>\n",
|
||
" <td>7.078969</td>\n",
|
||
" <td>64.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>157490</th>\n",
|
||
" <td>167</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>813</td>\n",
|
||
" <td>271</td>\n",
|
||
" <td>43.551520</td>\n",
|
||
" <td>1.093733</td>\n",
|
||
" <td>80.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1139411</th>\n",
|
||
" <td>1188</td>\n",
|
||
" <td>34</td>\n",
|
||
" <td>3856</td>\n",
|
||
" <td>1778</td>\n",
|
||
" <td>48.111339</td>\n",
|
||
" <td>1.093733</td>\n",
|
||
" <td>76.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1840659</th>\n",
|
||
" <td>2621</td>\n",
|
||
" <td>72</td>\n",
|
||
" <td>3267</td>\n",
|
||
" <td>3876</td>\n",
|
||
" <td>45.430798</td>\n",
|
||
" <td>1.093733</td>\n",
|
||
" <td>35.5</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1699429</th>\n",
|
||
" <td>2180</td>\n",
|
||
" <td>62</td>\n",
|
||
" <td>1123</td>\n",
|
||
" <td>3142</td>\n",
|
||
" <td>45.777455</td>\n",
|
||
" <td>1.093733</td>\n",
|
||
" <td>59.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2265223</th>\n",
|
||
" <td>3166</td>\n",
|
||
" <td>84</td>\n",
|
||
" <td>1060</td>\n",
|
||
" <td>4888</td>\n",
|
||
" <td>46.505000</td>\n",
|
||
" <td>1.093733</td>\n",
|
||
" <td>98.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>398123</th>\n",
|
||
" <td>536</td>\n",
|
||
" <td>16</td>\n",
|
||
" <td>646</td>\n",
|
||
" <td>732</td>\n",
|
||
" <td>47.082277</td>\n",
|
||
" <td>2.398566</td>\n",
|
||
" <td>63.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>313694</th>\n",
|
||
" <td>358</td>\n",
|
||
" <td>11</td>\n",
|
||
" <td>3611</td>\n",
|
||
" <td>520</td>\n",
|
||
" <td>43.353061</td>\n",
|
||
" <td>5.471240</td>\n",
|
||
" <td>61.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1157938</th>\n",
|
||
" <td>1173</td>\n",
|
||
" <td>34</td>\n",
|
||
" <td>3856</td>\n",
|
||
" <td>1778</td>\n",
|
||
" <td>48.111339</td>\n",
|
||
" <td>1.093733</td>\n",
|
||
" <td>103.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>761889</th>\n",
|
||
" <td>954</td>\n",
|
||
" <td>30</td>\n",
|
||
" <td>4933</td>\n",
|
||
" <td>1466</td>\n",
|
||
" <td>43.582785</td>\n",
|
||
" <td>1.093733</td>\n",
|
||
" <td>95.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>True</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>643365 rows × 9 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ZIP_CODE DEPT_CODE CITY INSEE_CODE LATITUDE LONGITUDE SURFACE \\\n",
|
||
"117146 152 5 1749 332 43.564978 7.078969 64.0 \n",
|
||
"157490 167 5 813 271 43.551520 1.093733 80.0 \n",
|
||
"1139411 1188 34 3856 1778 48.111339 1.093733 76.0 \n",
|
||
"1840659 2621 72 3267 3876 45.430798 1.093733 35.5 \n",
|
||
"1699429 2180 62 1123 3142 45.777455 1.093733 59.0 \n",
|
||
"... ... ... ... ... ... ... ... \n",
|
||
"2265223 3166 84 1060 4888 46.505000 1.093733 98.0 \n",
|
||
"398123 536 16 646 732 47.082277 2.398566 63.0 \n",
|
||
"313694 358 11 3611 520 43.353061 5.471240 61.0 \n",
|
||
"1157938 1173 34 3856 1778 48.111339 1.093733 103.0 \n",
|
||
"761889 954 30 4933 1466 43.582785 1.093733 95.0 \n",
|
||
"\n",
|
||
" HEATING_MODE ELEVATOR \n",
|
||
"117146 0 True \n",
|
||
"157490 1 True \n",
|
||
"1139411 0 True \n",
|
||
"1840659 1 True \n",
|
||
"1699429 1 True \n",
|
||
"... ... ... \n",
|
||
"2265223 1 True \n",
|
||
"398123 1 True \n",
|
||
"313694 0 True \n",
|
||
"1157938 1 True \n",
|
||
"761889 1 True \n",
|
||
"\n",
|
||
"[643365 rows x 9 columns]"
|
||
]
|
||
},
|
||
"execution_count": 168,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Integer-encode every categorical column.\n",
|
||
"# Categories are learned from X_train only and the same mapping is applied to\n",
|
||
"# X_val; before, X_val stayed un-encoded and lin_reg.predict(X_val) failed with\n",
|
||
"# 'could not convert string to float'. A single shared LabelEncoder also kept\n",
|
||
"# only the classes of the last column it was fitted on.\n",
|
||
"# Values absent from X_train get code -1 (how pandas marks 'not a category').\n",
|
||
"# NOTE(review): integer codes impose an arbitrary order on nominal features;\n",
|
||
"# consider one-hot encoding for the linear model.\n",
|
||
"for col in cat_cols:\n",
|
||
"    categories = pd.Categorical(X_train[col]).categories\n",
|
||
"    X_train[col] = pd.Categorical(X_train[col], categories=categories).codes.astype('int64')\n",
|
||
"    X_val[col] = pd.Categorical(X_val[col], categories=categories).codes.astype('int64')\n",
|
||
"\n",
|
||
"X_train.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 169,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" after removing the cwd from sys.path.\n",
|
||
"/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/pandas/core/indexing.py:966: SettingWithCopyWarning: \n",
|
||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||
"\n",
|
||
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
||
" self.obj[item] = s\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Numeric gaps: learn the column means on the training split and use them\n",
|
||
"# to fill both splits.\n",
|
||
"num_imputer = SimpleImputer(strategy='mean').fit(X_train[num_cols])\n",
|
||
"for frame in (X_train, X_val):\n",
|
||
"    frame[num_cols] = num_imputer.transform(frame[num_cols])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# from sklearn.metrics import mean_absolute_error  # used to evaluate the error\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 170,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.linear_model import LinearRegression"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 171,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Ordinary least-squares baseline fitted on the training split.\n",
|
||
"lin_reg = LinearRegression()\n",
|
||
"lin_reg.fit(X_train, y_train);"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 173,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.metrics import mean_absolute_error"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 175,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "ValueError",
|
||
"evalue": "could not convert string to float: 'carcassonne'",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
||
"\u001b[0;32m<ipython-input-175-b23f0d3c285a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpreds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlin_reg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36mpredict\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0mpredicted\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 235\u001b[0m \"\"\"\n\u001b[0;32m--> 236\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_decision_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 237\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[0m_preprocess_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstaticmethod\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_preprocess_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/linear_model/_base.py\u001b[0m in \u001b[0;36m_decision_function\u001b[0;34m(self, X)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0mcheck_is_fitted\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 218\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'csc'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'coo'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 219\u001b[0m return safe_sparse_dot(X, self.coef_.T,\n\u001b[1;32m 220\u001b[0m dense_output=True) + self.intercept_\n",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 71\u001b[0m FutureWarning)\n\u001b[1;32m 72\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcasting\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"unsafe\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0masarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 600\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mComplexWarning\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m raise ValueError(\"Complex data not supported\\n\"\n",
|
||
"\u001b[0;32m~/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/numpy/core/_asarray.py\u001b[0m in \u001b[0;36masarray\u001b[0;34m(a, dtype, order)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 84\u001b[0m \"\"\"\n\u001b[0;32m---> 85\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 86\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'carcassonne'"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Predict the target for the validation split with the fitted model.\n",
|
||
"preds = lin_reg.predict(X_val)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3.7.7 64-bit ('.env': venv)",
|
||
"language": "python",
|
||
"name": "python37764bitenvvenv28a5e1c45c144c06bbb2a58e0b29bacb"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.7"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|