%matplotlib inline
import os
import numpy as np
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from cycler import cycler
import pooch # download data / avoid re-downloading
from IPython import get_ipython
import lzma # to process zip file
import plotly.express as px
"colorblind")
sns.set_palette(= sns.color_palette("twilight", n_colors=12)
palette = 8 pd.options.display.max_rows
Disclaimer: this course is adapted from the work Pandas tutorial by Joris Van den Bossche. R
users might also want to read Pandas: Comparison with R / R libraries for a smooth start in Pandas.
We start by importing the necessary libraries:
References:
Data loading and preprocessing
Data loading
# url = "https://koumoul.com/s/data-fair/api/v1/datasets/accidents-velos/raw"
= "https://github.com/josephsalmon/HAX712X/raw/main/Data/accidents-velos_2022.csv.xz"
url_db = "./bicycle_db.csv.xz"
path_target = os.path.split(path_target)
path, fname =path, fname=fname, known_hash=None)
pooch.retrieve(url_db, pathwith lzma.open(path_target) as f:
= f.read().decode('utf-8')
file_content
# write the string file_content to a file named fname_uncompressed
with open("./bicycle_db.csv", 'w') as f:
f.write(file_content)
= pd.read_csv("bicycle_db.csv", na_values="", low_memory=False,
df_bikes ={'data': str, 'heure': str, 'departement': str}) dtype
In June 2023, the author decided to change the name of the columns, hence we had to define a dictionary to come back to legacy names:
= {
new2old "hrmn": "heure",
"secuexist": "existence securite",
"grav": "gravite accident",
"dep": "departement",
}
=new2old, inplace=True) df_bikes.rename(columns
'head -5 ./bicycle_db.csv') get_ipython().system(
= 40
pd.options.display.max_columns df_bikes.head()
identifiant accident | date | mois | jour | heure | departement | commune | lat | lon | en agglomeration | type intersection | type collision | luminosite | conditions atmosperiques | type route | circulation | nb voies | profil long route | trace plan route | largeur TPC | largeur route | etat surface | amenagement | situation | categorie usager | gravite accident | sexe | age | motif deplacement | existence securite | usage securite | obstacle fixe heurte | obstacle mobile heurte | localisation choc | manoeuvre avant accident | identifiant vehicule | type autres vehicules | manoeuvre autres vehicules | nombre autres vehicules | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 201100000004 | 2011-09-22 | 09 - septembre | 3 - jeudi | 15 | 59 | 59011 | 50.51861 | 2.93043 | oui | Hors intersection | Trois véhicules et plus - collisions multiples | Plein jour | Normale | Route Départementale | NaN | NaN | Plat | Partie rectiligne | NaN | 58.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 39-40 | Promenade - loisirs | Casque | Oui | NaN | Véhicule | NaN | Sans changement de direction | 201100000004A01 | VL seul | Tournant à gauche | 1.0 |
1 | 201100000004 | 2011-09-22 | 09 - septembre | 3 - jeudi | 15 | 59 | 59011 | 50.51861 | 2.93043 | oui | Hors intersection | Trois véhicules et plus - collisions multiples | Plein jour | Normale | Route Départementale | NaN | NaN | Plat | Partie rectiligne | NaN | 58.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 30-31 | Promenade - loisirs | Casque | Non | NaN | Véhicule | NaN | Sans changement de direction | 201100000004B02 | VL seul | Tournant à gauche | 1.0 |
2 | 201100000006 | 2011-11-14 | 11 - novembre | 0 - lundi | 17 | 59 | 59011 | 50.52684 | 2.93423 | oui | Hors intersection | Deux véhicules - par l’arrière | Nuit avec éclairage public allumé | Normale | Route Départementale | NaN | NaN | NaN | Partie rectiligne | NaN | NaN | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 22-23 | Domicile - travail | Casque | Oui | NaN | NaN | Avant | NaN | 201100000006A01 | VL seul | Arrêté (hors stationnement) | 1.0 |
3 | 201100000020 | 2011-01-27 | 01 - janvier | 3 - jeudi | 18 | 59 | 59256 | 50.55818 | 3.13667 | oui | Hors intersection | Deux véhicules - par le coté | Nuit avec éclairage public allumé | Normale | Voie Communale | NaN | 2.0 | Plat | Partie rectiligne | NaN | 60.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | F | 36-37 | Autre | Equipement réfléchissant | Oui | NaN | Véhicule | Avant droit | Sans changement de direction | 201100000020B02 | VL seul | En stationnement (avec occupants) | 1.0 |
4 | 201100000022 | 2011-09-01 | 09 - septembre | 3 - jeudi | 19 | 59 | 59256 | 50.55448 | 3.12660 | oui | Hors intersection | Deux véhicules - frontale | Crépuscule ou aube | Normale | Hors réseau public | NaN | 2.0 | Plat | Partie rectiligne | NaN | 60.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 15-16 | Promenade - loisirs | Ceinture | NaN | NaN | Véhicule | Avant gauche | Sans changement de direction | 201100000022B02 | VL seul | Sans changement de direction | 1.0 |
'existence securite'].unique() df_bikes[
array(['Casque', 'Equipement réfléchissant', 'Ceinture', 'Autre', nan,
'Dispositif enfants'], dtype=object)
'gravite accident'].unique() df_bikes[
array(['1 - Blessé léger', '2 - Blessé hospitalisé', '3 - Tué',
'0 - Indemne'], dtype=object)
Handle missing values
'date'].hasnans df_bikes[
False
'heure'].hasnans df_bikes[
True
So arbitrarily we fill missing values with 0 (since apparently there is no time 0 reported…to double check in the source.)
'heure':'0'}, inplace=True) df_bikes.fillna({
Date and time processing
Check the date/time format:
'date'] + ' ' + df_bikes['heure'] df_bikes[
0 2011-09-22 15
1 2011-09-22 15
2 2011-11-14 17
3 2011-01-27 18
...
35330 2018-03-21 18
35331 2018-03-31 17
35332 2018-03-31 17
35333 2018-07-31 11
Length: 35334, dtype: object
= pd.to_datetime(
time_improved "date"] + " " + df_bikes["heure"],
df_bikes[format="%Y-%m-%d %H",
="coerce",
errors )
"Time"] = time_improved
df_bikes[# remove rows with NaT
=["Time"], inplace=True)
df_bikes.dropna(subset# set new index
"Time", inplace=True)
df_bikes.set_index(# remove useless columns
=["heure", "date"], inplace=True) df_bikes.drop(columns
df_bikes.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35334 entries, 2011-09-22 15:00:00 to 2018-07-31 11:00:00
Data columns (total 37 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 identifiant accident 35334 non-null int64
1 mois 35334 non-null object
2 jour 35334 non-null object
3 departement 35334 non-null object
4 commune 35334 non-null object
5 lat 35334 non-null float64
6 lon 35334 non-null float64
7 en agglomeration 35334 non-null object
8 type intersection 35333 non-null object
9 type collision 35330 non-null object
10 luminosite 35334 non-null object
11 conditions atmosperiques 35333 non-null object
12 type route 35323 non-null object
13 circulation 132 non-null object
14 nb voies 31054 non-null float64
15 profil long route 32748 non-null object
16 trace plan route 31556 non-null object
17 largeur TPC 2705 non-null float64
18 largeur route 18503 non-null float64
19 etat surface 33694 non-null object
20 amenagement 3880 non-null object
21 situation 32742 non-null object
22 categorie usager 35334 non-null object
23 gravite accident 35334 non-null object
24 sexe 35334 non-null object
25 age 35323 non-null object
26 motif deplacement 27579 non-null object
27 existence securite 34789 non-null object
28 usage securite 33690 non-null object
29 obstacle fixe heurte 803 non-null object
30 obstacle mobile heurte 28615 non-null object
31 localisation choc 30129 non-null object
32 manoeuvre avant accident 31396 non-null object
33 identifiant vehicule 35334 non-null object
34 type autres vehicules 30531 non-null object
35 manoeuvre autres vehicules 28143 non-null object
36 nombre autres vehicules 30531 non-null float64
dtypes: float64(6), int64(1), object(30)
memory usage: 10.2+ MB
= df_bikes.loc[
df_bike2 "gravite accident", "existence securite", "age", "sexe"]
:, [
]# df.replace(to_replace={'col1' : {99 : 0}, 'col2' : {99 : 0}},
# inplace=True)
={"existence securite": {"Inconnu": np.nan}}, inplace=True)
df_bike2.replace(to_replace=True) df_bike2.dropna(inplace
gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué |
---|---|---|---|---|
existence securite | ||||
Autre | 6.380605 | 62.294583 | 29.143843 | 2.180970 |
Casque | 7.052803 | 55.608281 | 33.454292 | 3.884624 |
Ceinture | 3.414634 | 15.000000 | 69.268293 | 12.317073 |
Dispositif enfants | 7.317073 | 64.634146 | 24.390244 | 3.658537 |
Equipement réfléchissant | 4.994055 | 56.242568 | 33.848593 | 4.914784 |
gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué |
---|---|---|---|---|
existence securite | ||||
Autre | 6.380605 | 62.294583 | 29.143843 | 2.180970 |
Casque | 7.052803 | 55.608281 | 33.454292 | 3.884624 |
Ceinture | 3.414634 | 15.000000 | 69.268293 | 12.317073 |
Dispositif enfants | 7.317073 | 64.634146 | 24.390244 | 3.658537 |
Equipement réfléchissant | 4.994055 | 56.242568 | 33.848593 | 4.914784 |
sexe
F 0.239911
M 0.760089
dtype: float64
gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué | All |
---|---|---|---|---|---|
sexe | |||||
F | 13.40564 | 27.903906 | 19.946115 | 15.884194 | 23.868538 |
M | 86.59436 | 72.096094 | 80.053885 | 84.115806 | 76.131462 |
Data visualization
Note that in the dataset, the information on the level of bike practice by gender is missing.
Time series visualization
"weekday"] = df_bikes.index.day_of_week # Monday=0, Sunday=6
df_bikes['weekday', df_bikes.index.hour])['sexe'].count() df_bikes.groupby([
weekday Time
0 0 16
1 12
2 2
3 1
...
6 20 122
21 73
22 42
23 32
Name: sexe, Length: 168, dtype: int64
'weekday', df_bikes.index.hour])['age'].count() df_bikes.groupby([
weekday Time
0 0 16
1 12
2 2
3 1
...
6 20 122
21 73
22 42
23 32
Name: age, Length: 168, dtype: int64
The two last results are the same, no matter if you choose the 'age'
or 'sexe'
variable.
Create a daily profile per day of the week:
= (
df_polar "weekday", df_bikes.index.hour])["sexe"]
df_bikes.groupby([
.count()
.reset_index()# all variable are similar in this sense, sexe could be replaced by age for instance here. XXX to simplify
)
= df_polar.astype({"Time": str}, copy=False)
df_polar "weekday"] = df_polar["weekday"].apply(lambda x: calendar.day_abbr[x])
df_polar[={"sexe": "accidents"}, inplace=True) df_polar.rename(columns
Display these daily profiles
= 8 # 7 days, but 8 colors help to have weekends days' color closer
n_colors = px.colors.sample_colorscale(
colors "mrybm", [n / (n_colors - 1) for n in range(n_colors)]
)
= px.line_polar(
fig
df_polar,="accidents",
r="Time",
theta="weekday",
color=True,
line_close=[0, 600],
range_r=0,
start_angle=colors,
color_discrete_sequence="seaborn",
template="Daily accident profile: weekday effect?",
title
)
fig.show()
In plotly
the figure is interactive. If you click on the legend on the right, you can select the day you want to see. It is very convenient to compare days two by two for instance.
Create a daily profile per month:
"month"] = df_bikes.index.month # Janvier=0, .... Decembre=11
df_bikes[# df_bikes['month'] = df_bikes['month'].apply(lambda x: calendar.month_abbr[x])
df_bikes.head()= (
df_bikes_month "month", df_bikes.index.hour])["age"]
df_bikes.groupby([
.count()=0)
.unstack(level )
= (
df_polar2 "month", df_bikes.index.hour])["sexe"]
df_bikes.groupby([
.count()
.reset_index()# all variable are similar in this sense, sexe could be replaced by age for instance here. XXX to simplify
)
= df_polar2.astype({"Time": str}, copy=False)
df_polar2 ={"sexe": "accidents"}, inplace=True)
df_polar2.rename(columns"month"] = df_polar2["month"].apply(lambda x: calendar.month_abbr[x]) df_polar2[
Display these daily profiles :
# create a cyclical color scale for 12 values:
= 12
n_colors = px.colors.sample_colorscale(
colors "mrybm", [n / (n_colors - 1) for n in range(n_colors)]
)
= px.line_polar(
fig
df_polar2,="accidents",
r="Time",
theta="month",
color=True,
line_close=[0, 410],
range_r=0,
start_angle=colors,
color_discrete_sequence="seaborn",
template="Daily accident profile: weekday effect?",
title
) fig.show()
Geographic visualization
In this part, we will use the geopandas library to visualize the data on a map, along with plotly
for interactivity.
= "./dpt_population.csv"
path_target = "https://public.opendatasoft.com/explore/dataset/population-francaise-par-departement-2018/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B"
url = os.path.split(path_target)
path, fname =path, fname=fname, known_hash=None) pooch.retrieve(url, path
'/home/jsalmon/Documents/Mes_cours/Montpellier/HAX712X/Courses/Pandas/dpt_population.csv'
= pd.read_csv("dpt_population.csv", sep=";", low_memory=False)
df_dtp_pop
={"code": {"2A": "20A", "2B": "20B"}}, inplace=True)
df_dtp_pop.replace(to_replace=["Code Département"], inplace=True)
df_dtp_pop.sort_values(by
={"code": {"2A": "20A", "2B": "20B"}}, inplace=True)
df_bikes.replace(to_replace=["departement"], inplace=True)
df_bikes.sort_values(by# Clean extra departements
= df_bikes[df_bikes["departement"].isin(df_dtp_pop["Code Département"])]
df_bikes
= df_bikes.groupby(["departement"], as_index=True, sort=True).size()
gd
= {"code": gd.index, "# Accidents per million": gd.values}
data = pd.DataFrame(data)
df "# Accidents per million"] = (
df["# Accidents per million"].values
df[* 10000.0
/ df_dtp_pop["Population"].values
)
We now need to download the .geojson
file containing the geographic information for each department. We will use the pooch
library to download the file and store it locally.
= "./departements.geojson"
path_target # url = "https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements-avec-outre-mer.geojson"
= "https://raw.githubusercontent.com/gregoiredavid/france-geojson/master/departements-version-simplifiee.geojson"
url = os.path.split(path_target)
path, fname =path, fname=fname, known_hash=None) pooch.retrieve(url, path
'/home/jsalmon/Documents/Mes_cours/Montpellier/HAX712X/Courses/Pandas/departements.geojson'
First, you have to handle Corsican departments, which are not in the same format as the others.
import plotly.express as px
import geopandas
= geopandas.read_file("departements.geojson")
departement
={"code": {"2A": "20A", "2B": "20B"}}, inplace=True)
departement.replace(to_replace
=["code"], inplace=True)
departement.sort_values(by
= ["0" + str(i) for i in range(1, 10)]
a = [str(i) for i in range(1, 10)]
b = dict(zip(a, b))
dict_replace
departement.replace(={"code": {"20A": "2A", "20B": "2B", **dict_replace}}, inplace=True
to_replace
)
df.replace(={"code": {"20A": "2A", "20B": "2B", **dict_replace}}, inplace=True
to_replace
)
"code", inplace=True)
departement.set_index(print(departement["nom"].head(22))
code
1 Ain
2 Aisne
3 Allier
4 Alpes-de-Haute-Provence
...
19 Corrèze
2A Corse-du-Sud
2B Haute-Corse
21 Côte-d'Or
Name: nom, Length: 22, dtype: object
Once this is done, you can plot the data on a map.
= px.choropleth_mapbox(
fig
df,=departement,
geojson="code",
locations="# Accidents per million",
color=(0, df["# Accidents per million"].max()),
range_color="rdbu",
color_continuous_scale={"lat": 44, "lon": 2},
center=3.55,
zoom="white-bg",
mapbox_style
)=dict(type="choroplethmapbox"))
fig.update_traces(selector
fig.update_layout(="Accidents per million inhabitants by department",
title_text=dict(thickness=20, orientation="h", y=0.051, x=0.5),
coloraxis_colorbar
) fig.show()
References
- Other interactive tools for data visualization: Altair, Bokeh. See comparisons by Aarron Geller: link
- An interesting tutorial on Altair: Altair introduction