{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Modelling" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Exploration" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/nemausat/Dev/Python/Challenges_projects/fake_news/.env/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3063: DtypeWarning: Columns (13) have mixed types.Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "data": { "text/html": [ "
| \n", " | ID | \n", "LATITUDE | \n", "LONGITUDE | \n", "BLUR_RADIUS | \n", "PRICE | \n", "CONDOMINIUM_EXPENSES | \n", "WATER_HEATING_MODE | \n", "FLOOR | \n", "FLOOR_COUNT | \n", "LOT_COUNT | \n", "... | \n", "TERRACE | \n", "TERRACE_SURFACE | \n", "SWIMMING_POOL | \n", "GARDEN | \n", "STANDING | \n", "SMALL_BUILDING | \n", "CORNER_BUILDING | \n", "PUBLICATION_START_DATE | \n", "DEALER_NAME | \n", "DEALER_TYPE | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | \n", "0.0 | \n", "2.346152e+06 | \n", "1.222870e+06 | \n", "1.221350e+06 | \n", "2.346125e+06 | \n", "9.259670e+05 | \n", "0.0 | \n", "1.729398e+06 | \n", "1.076405e+06 | \n", "1.014007e+06 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
| mean | \n", "NaN | \n", "4.573629e+01 | \n", "2.913308e+00 | \n", "4.661755e+02 | \n", "2.338131e+05 | \n", "1.211348e+04 | \n", "NaN | \n", "2.163156e+00 | \n", "9.598156e+00 | \n", "9.701759e+05 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| std | \n", "NaN | \n", "6.898158e+00 | \n", "5.784470e+00 | \n", "3.041044e+02 | \n", "2.942130e+05 | \n", "4.006808e+06 | \n", "NaN | \n", "7.338205e+00 | \n", "5.448560e+03 | \n", "7.661597e+08 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| min | \n", "NaN | \n", "-2.138532e+01 | \n", "-6.177292e+01 | \n", "5.000000e+01 | \n", "1.000000e+00 | \n", "1.000000e-02 | \n", "NaN | \n", "-9.000000e+00 | \n", "0.000000e+00 | \n", "0.000000e+00 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 25% | \n", "NaN | \n", "4.362788e+01 | \n", "1.408058e+00 | \n", "2.500000e+02 | \n", "1.120000e+05 | \n", "7.200000e+02 | \n", "NaN | \n", "1.000000e+00 | \n", "2.000000e+00 | \n", "1.500000e+01 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 50% | \n", "NaN | \n", "4.649788e+01 | \n", "2.441038e+00 | \n", "5.000000e+02 | \n", "1.690000e+05 | \n", "1.200000e+03 | \n", "NaN | \n", "2.000000e+00 | \n", "4.000000e+00 | \n", "4.300000e+01 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 75% | \n", "NaN | \n", "4.883240e+01 | \n", "5.383308e+00 | \n", "7.500000e+02 | \n", "2.590000e+05 | \n", "2.000000e+03 | \n", "NaN | \n", "3.000000e+00 | \n", "5.000000e+00 | \n", "1.030000e+02 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| max | \n", "NaN | \n", "5.108570e+01 | \n", "5.572088e+01 | \n", "1.000000e+03 | \n", "7.500000e+07 | \n", "1.920350e+09 | \n", "NaN | \n", "2.019000e+03 | \n", "5.652800e+06 | \n", "7.610212e+11 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
8 rows × 22 columns
\n", "| \n", " | ID | \n", "AD_URLS | \n", "PROPERTY_TYPE | \n", "DEPT_CODE | \n", "ZIP_CODE | \n", "CITY | \n", "INSEE_CODE | \n", "LATITUDE | \n", "LONGITUDE | \n", "BLUR_RADIUS | \n", "... | \n", "GARDEN | \n", "STANDING | \n", "NEW_BUILD | \n", "SMALL_BUILDING | \n", "CORNER_BUILDING | \n", "PUBLICATION_START_DATE | \n", "DEALER_NAME | \n", "DEALER_TYPE | \n", "REFERENCE_NUMBER | \n", "ENERGY_CLASSIFICATION | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8 | \n", "NaN | \n", "https://www.bienici.com/annonce/adapt-immo-850... | \n", "APARTMENT | \n", "01 | \n", "01750 | \n", "Saint-Laurent-sur-Saône | \n", "01370 | \n", "46.310072 | \n", "4.841813 | \n", "1000.0 | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "8500236963 | \n", "NS | \n", "
| 9 | \n", "NaN | \n", "https://www.bienici.com/annonce/orpi-1-099934E... | \n", "APARTMENT | \n", "01 | \n", "01750 | \n", "Saint-Laurent-sur-Saône | \n", "01370 | \n", "46.304441 | \n", "4.841141 | \n", "50.0 | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "099934E0KVCC | \n", "NaN | \n", "
| 15 | \n", "NaN | \n", "https://www.bienici.com/annonce/fnaim-39964032 | \n", "APARTMENT | \n", "01 | \n", "01000 | \n", "Bourg-en-Bresse | \n", "01053 | \n", "46.205119 | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "25871 | \n", "NaN | \n", "
| 17 | \n", "NaN | \n", "https://www.bienici.com/annonce/ag752451-11061... | \n", "APARTMENT | \n", "01 | \n", "01300 | \n", "Belley | \n", "01034 | \n", "45.742145 | \n", "5.686873 | \n", "1000.0 | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "421601048 | \n", "A | \n", "
| 18 | \n", "NaN | \n", "https://www.bienici.com/annonce/ag752451-11278... | \n", "APARTMENT | \n", "01 | \n", "01140 | \n", "Thoissey | \n", "01420 | \n", "46.169511 | \n", "4.784252 | \n", "1000.0 | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "421601005 | \n", "C | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2346135 | \n", "NaN | \n", "https://www.bienici.com/annonce/fnaim-37667773 | \n", "APARTMENT | \n", "78 | \n", "78100 | \n", "Saint-Germain-en-Laye | \n", "78551 | \n", "48.899041 | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "7000 | \n", "D | \n", "
| 2346136 | \n", "NaN | \n", "https://www.bienici.com/annonce/fnaim-38342524 | \n", "APARTMENT | \n", "78 | \n", "78100 | \n", "Saint-Germain-en-Laye | \n", "78551 | \n", "48.899041 | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "7000 | \n", "D | \n", "
| 2346138 | \n", "NaN | \n", "https://www.bienici.com/annonce/pericles-15861344 | \n", "APARTMENT | \n", "78 | \n", "78100 | \n", "Saint-Germain-en-Laye | \n", "78551 | \n", "48.897947 | \n", "2.100340 | \n", "50.0 | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "15861344 | \n", "E | \n", "
| 2346140 | \n", "NaN | \n", "https://www.bienici.com/annonce/immo-facile-22... | \n", "APARTMENT | \n", "78 | \n", "78100 | \n", "Saint-Germain-en-Laye | \n", "78551 | \n", "48.895068 | \n", "2.096111 | \n", "250.0 | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "3984 | \n", "D | \n", "
| 2346142 | \n", "NaN | \n", "https://www.bienici.com/annonce/snpi-1106555 | \n", "APARTMENT | \n", "78 | \n", "78000 | \n", "Versailles | \n", "78646 | \n", "48.803540 | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "False | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "AZ1-2014 | \n", "NaN | \n", "
919093 rows × 39 columns
\n", "| \n", " | ZIP_CODE | \n", "DEPT_CODE | \n", "CITY | \n", "INSEE_CODE | \n", "LATITUDE | \n", "LONGITUDE | \n", "SURFACE | \n", "HEATING_MODE | \n", "ELEVATOR | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 8 | \n", "01750 | \n", "01 | \n", "Saint-Laurent-sur-Saône | \n", "01370 | \n", "46.310072 | \n", "4.841813 | \n", "45.0 | \n", "NaN | \n", "NaN | \n", "
| 9 | \n", "01750 | \n", "01 | \n", "Saint-Laurent-sur-Saône | \n", "01370 | \n", "46.304441 | \n", "4.841141 | \n", "29.5 | \n", "NaN | \n", "NaN | \n", "
| 15 | \n", "01000 | \n", "01 | \n", "Bourg-en-Bresse | \n", "01053 | \n", "46.205119 | \n", "NaN | \n", "19.0 | \n", "NaN | \n", "NaN | \n", "
| 17 | \n", "01300 | \n", "01 | \n", "Belley | \n", "01034 | \n", "45.742145 | \n", "5.686873 | \n", "59.0 | \n", "NaN | \n", "False | \n", "
| 18 | \n", "01140 | \n", "01 | \n", "Thoissey | \n", "01420 | \n", "46.169511 | \n", "4.784252 | \n", "68.0 | \n", "COLLECTIVE | \n", "NaN | \n", "
| \n", " | ZIP_CODE | \n", "DEPT_CODE | \n", "CITY | \n", "INSEE_CODE | \n", "LATITUDE | \n", "LONGITUDE | \n", "SURFACE | \n", "HEATING_MODE | \n", "ELEVATOR | \n", "
|---|---|---|---|---|---|---|---|---|---|
| 117146 | \n", "152 | \n", "5 | \n", "1749 | \n", "332 | \n", "43.564978 | \n", "7.078969 | \n", "64.0 | \n", "0 | \n", "True | \n", "
| 157490 | \n", "167 | \n", "5 | \n", "813 | \n", "271 | \n", "43.551520 | \n", "1.093733 | \n", "80.0 | \n", "1 | \n", "True | \n", "
| 1139411 | \n", "1188 | \n", "34 | \n", "3856 | \n", "1778 | \n", "48.111339 | \n", "1.093733 | \n", "76.0 | \n", "0 | \n", "True | \n", "
| 1840659 | \n", "2621 | \n", "72 | \n", "3267 | \n", "3876 | \n", "45.430798 | \n", "1.093733 | \n", "35.5 | \n", "1 | \n", "True | \n", "
| 1699429 | \n", "2180 | \n", "62 | \n", "1123 | \n", "3142 | \n", "45.777455 | \n", "1.093733 | \n", "59.0 | \n", "1 | \n", "True | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 2265223 | \n", "3166 | \n", "84 | \n", "1060 | \n", "4888 | \n", "46.505000 | \n", "1.093733 | \n", "98.0 | \n", "1 | \n", "True | \n", "
| 398123 | \n", "536 | \n", "16 | \n", "646 | \n", "732 | \n", "47.082277 | \n", "2.398566 | \n", "63.0 | \n", "1 | \n", "True | \n", "
| 313694 | \n", "358 | \n", "11 | \n", "3611 | \n", "520 | \n", "43.353061 | \n", "5.471240 | \n", "61.0 | \n", "0 | \n", "True | \n", "
| 1157938 | \n", "1173 | \n", "34 | \n", "3856 | \n", "1778 | \n", "48.111339 | \n", "1.093733 | \n", "103.0 | \n", "1 | \n", "True | \n", "
| 761889 | \n", "954 | \n", "30 | \n", "4933 | \n", "1466 | \n", "43.582785 | \n", "1.093733 | \n", "95.0 | \n", "1 | \n", "True | \n", "
643365 rows × 9 columns
\n", "