import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the household CSV tables. The directory is defined once so that the
# data location only has to be changed in a single place.
from pathlib import Path

HOUSEHOLDS_DIR = Path('/Users/servandotorres/Documents/data/households')

general = pd.read_csv(HOUSEHOLDS_DIR / 'general.csv')
economy = pd.read_csv(HOUSEHOLDS_DIR / 'economy.csv')
house_info = pd.read_csv(HOUSEHOLDS_DIR / 'households_info.csv')
social_services_access = pd.read_csv(HOUSEHOLDS_DIR / 'social_services_access.csv')
# Same file as `economy`, but with the 'village' column used as the row index.
economy_1 = pd.read_csv(HOUSEHOLDS_DIR / 'economy.csv', index_col='village')
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
gender |
hh_head |
relation_hh_head |
gender_hh_head |
district |
ward |
village |
subvillage |
house_condition |
0 |
V4-01 |
Male |
Yes |
NaN |
NaN |
Muheza |
Misalai |
Kazita |
Kwemuyu |
The house is not good |
1 |
V4-02 |
Female |
Yes |
NaN |
NaN |
Muheza |
Misalai |
Kazita |
Kwemuyu |
The house is not good |
2 |
V4-03 |
Female |
Yes |
NaN |
NaN |
Muheza |
Misalai |
Kazita |
Kwemuyu |
The house is normal |
3 |
V4-04 |
Female |
Yes |
NaN |
NaN |
Muheza |
Misalai |
Kazita |
Kwemuyu |
The house is normal |
4 |
V4-05 |
Male |
Yes |
NaN |
NaN |
Muheza |
Misalai |
Kazita |
Kwemuyu |
Good house |
# The data is a bit dirty: count the missing values in each column.
pd.isnull(general).sum()
id 0
gender 0
hh_head 0
relation_hh_head 226
gender_hh_head 226
district 0
ward 0
village 0
subvillage 0
house_condition 46
dtype: int64
# Preview the first rows of the `general` table.
general.head()
# (number of rows, number of columns)
general.shape
# Descriptive statistics
general.describe()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
gender |
hh_head |
relation_hh_head |
gender_hh_head |
district |
ward |
village |
subvillage |
house_condition |
count |
339 |
339 |
339 |
113 |
113 |
339 |
339 |
339 |
339 |
293 |
unique |
339 |
2 |
2 |
5 |
2 |
1 |
2 |
8 |
39 |
75 |
top |
V3-09 |
Male |
Yes |
husband / wife |
Male |
Muheza |
Misalai |
Misalai |
Mlalo |
Normal house |
freq |
1 |
179 |
226 |
74 |
100 |
339 |
224 |
70 |
20 |
77 |
# Categorical description: summarise the string (object) and categorical
# columns, transposed so that each column of `general` becomes a row.
# The dtypes are selected by name because the `np.object` alias no longer
# exists in current NumPy releases and `pd.Categorical` is an array class,
# not a dtype; this also avoids using `np` before it is imported.
general.describe(include=['object', 'category']).T
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
count |
unique |
top |
freq |
id |
339 |
339 |
V3-09 |
1 |
gender |
339 |
2 |
Male |
179 |
hh_head |
339 |
2 |
Yes |
226 |
relation_hh_head |
113 |
5 |
husband / wife |
74 |
gender_hh_head |
113 |
2 |
Male |
100 |
district |
339 |
1 |
Muheza |
339 |
ward |
339 |
2 |
Misalai |
224 |
village |
339 |
8 |
Misalai |
70 |
subvillage |
339 |
39 |
Mlalo |
20 |
house_condition |
293 |
75 |
Normal house |
77 |
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
acres_own_land |
acres_rent_land |
acres_not_paying_land |
certificate |
acres_cultivate_land |
land_use |
decision_maker |
savings |
0 |
V4-01 |
Kazita |
7.0 |
0.0 |
0.0 |
No |
7.0 |
Crop cultivation |
father |
Yes |
1 |
V4-02 |
Kazita |
0.5 |
0.0 |
0.0 |
No |
0.5 |
Crop cultivation |
mother |
No |
2 |
V4-03 |
Kazita |
4.0 |
0.0 |
0.0 |
No |
1.0 |
Crop cultivation |
mother |
Yes |
3 |
V4-04 |
Kazita |
1.5 |
0.0 |
0.0 |
Yes |
1.5 |
Crop cultivation |
both |
Yes |
4 |
V4-05 |
Kazita |
0.0 |
0.0 |
0.0 |
No |
0.0 |
NaN |
father |
Yes |
Visión global del tomador de decisiones
# Count of rows for each `decision_maker` value, with the categories on the y axis.
# NOTE(review): `sns` is not imported anywhere in the visible file; this
# presumably needs `import seaborn as sns` next to the other imports.
sns.countplot(data=economy, y='decision_maker')
<matplotlib.axes._subplots.AxesSubplot at 0x1131985c0>
# Null values: number of missing entries in each column of `economy`.
pd.isnull(economy).sum()
id 0
village 0
acres_own_land 0
acres_rent_land 2
acres_not_paying_land 2
certificate 1
acres_cultivate_land 2
land_use 6
decision_maker 0
savings 0
dtype: int64
Superficie en propiedad por pueblo
# Bar charts of land area by village, one chart per land column.
# Every chart is drawn on its own 16x4 inch figure: without an explicit
# figure, consecutive barplot calls run in the same cell/script are drawn on
# top of each other on the current axes.
# The plain `acres_own_land` chart appeared twice in the original sequence;
# it is drawn once here.
# NOTE(review): `sns` is not imported anywhere in the visible file.

# Owned land per village.
_, ax = plt.subplots(figsize=(16, 4))
sns.barplot(x='village', y='acres_own_land', data=economy, ax=ax)

# Owned land per village, split by land use.
_, ax = plt.subplots(figsize=(16, 4))
sns.barplot(x='village', y='acres_own_land', hue='land_use',
            data=economy, palette='Greys', ax=ax)

# Land used without paying, per village.
_, ax = plt.subplots(figsize=(16, 4))
sns.barplot(x='village', y='acres_not_paying_land', data=economy, ax=ax)

# Rented land per village.
_, ax = plt.subplots(figsize=(16, 4))
sns.barplot(x='village', y='acres_rent_land', data=economy, ax=ax)
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
household_member |
gender |
age |
edu_level |
0 |
V1-03 |
Misalai |
1st member |
male |
80 |
Primary |
1 |
V4-01 |
Kazita |
2nd member |
female |
42 |
No formal education |
2 |
V4-01 |
Kazita |
3rd member |
female |
40 |
No formal education |
3 |
V4-01 |
Kazita |
1st member |
male |
55 |
Secundary |
4 |
V4-01 |
Kazita |
5th member |
female |
11 |
Primary |
# Preview the first rows of the social services access table.
social_services_access.head()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
primary_school |
service_quality |
secundary_school |
service_quality.1 |
market_place |
service_quality.2 |
dispensary |
service_quality.3 |
... |
agriculture_storage |
service_quality.8 |
agricultural_extension_service |
service_quality.9 |
veterinary |
service_quality.10 |
forest_service |
service_quality.11 |
water_service |
service_quality.12 |
0 |
V4-01 |
Kazita |
yes |
Bad |
no |
No |
no |
No |
no |
No |
... |
no |
No |
no |
No |
no |
No |
no |
No |
no |
No |
1 |
V4-02 |
Kazita |
yes |
Good |
yes |
Good |
no |
No |
no |
No |
... |
no |
No |
no |
No |
yes |
Bad |
yes |
Bad |
no |
No |
2 |
V4-03 |
Kazita |
yes |
Good |
yes |
Good |
yes |
Good |
yes |
Good |
... |
no |
No |
no |
No |
no |
No |
yes |
Good |
yes |
Good |
3 |
V4-04 |
Kazita |
yes |
Good |
yes |
Good |
yes |
Good |
no |
No |
... |
no |
No |
no |
No |
yes |
Good |
yes |
Good |
yes |
Good |
4 |
V4-05 |
Kazita |
yes |
Good |
yes |
Good |
yes |
Good |
yes |
Good |
... |
no |
No |
no |
No |
no |
No |
no |
No |
yes |
Good |
5 rows × 28 columns
import numpy as np
from pathlib import Path

# Load the water-access CSV tables. The directory is defined once so that the
# data location only has to be changed in a single place.
WATER_ACCESS_DIR = Path('/Users/servandotorres/Documents/data/water-access')

drinking_water_distance = pd.read_csv(WATER_ACCESS_DIR / 'drinking_water_distance.csv')
# water_collection.csv was previously read twice in a row; one read is enough.
water_collection = pd.read_csv(WATER_ACCESS_DIR / 'water_collection.csv')
water_payment = pd.read_csv(WATER_ACCESS_DIR / 'water_payment.csv')
water_quality = pd.read_csv(WATER_ACCESS_DIR / 'water_quality.csv')
water_sanitation = pd.read_csv(WATER_ACCESS_DIR / 'water_sanitation.csv')

drinking_water_distance['village']
# Distinct village names present in the table.
# NOTE(review): the recorded result contains '8 =Kwelumbizi', which looks like
# a label that kept its numeric code -- confirm and clean before grouping.
np.unique(drinking_water_distance['village'])
array(['8 =Kwelumbizi', 'Kazita', 'Kizerui', 'Kwemsoso', 'Mgambo',
'Misalai', 'Shambangeda', 'Zirai'], dtype=object)
# Preview the first rows of the drinking water distance table.
drinking_water_distance.head()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
rainy_season |
dry_season |
0 |
V4-01 |
Kazita |
1- 2 hours |
1- 2 hours |
1 |
V4-02 |
Kazita |
5 - 10 min |
5 - 10 min |
2 |
V4-03 |
Kazita |
5 - 10 min |
5 - 10 min |
3 |
V4-04 |
Kazita |
1- 2 hours |
1- 2 hours |
4 |
V4-05 |
Kazita |
< 5 min |
< 5 min |
# Preview the first rows of the water collection table.
water_collection.head()
# Disabled categorical summary, kept for reference:
#water_collection.describe(include=[np.object, pd.Categorical]).T
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
collector |
no_buckets |
0 |
V4-01 |
Kazita |
Adult women |
6 - 7 |
1 |
V4-02 |
Kazita |
Adult women |
1 - 2 |
2 |
V4-03 |
Kazita |
Adult women |
2 - 3 |
3 |
V4-04 |
Kazita |
Adult women |
5 - 6 |
4 |
V4-05 |
Kazita |
Adult men |
1 - 2 |
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
rainy_season |
dry_season |
0 |
V4-01 |
Kazita |
No |
No |
1 |
V4-02 |
Kazita |
No |
No |
2 |
V4-03 |
Kazita |
No |
No |
3 |
V4-04 |
Kazita |
No |
No |
4 |
V4-05 |
Kazita |
No |
No |
# Count of rows for each `dry_season` value in `water_payment`.
sns.countplot(data=water_payment, y='dry_season')
<matplotlib.axes._subplots.AxesSubplot at 0x11346a400>
# Count of rows for each `rainy_season` value in `water_payment`.
sns.countplot(data=water_payment, y='rainy_season')
<matplotlib.axes._subplots.AxesSubplot at 0x1135ee710>
# Summary of the string (object) and categorical columns of `water_quality`,
# transposed so that each column becomes a row.
# The dtypes are selected by name because the `np.object` alias no longer
# exists in current NumPy releases and `pd.Categorical` is an array class,
# not a dtype.
water_quality.describe(include=['object', 'category']).T
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
count |
unique |
top |
freq |
id |
339 |
339 |
V3-09 |
1 |
village |
339 |
8 |
Misalai |
70 |
dry_season |
339 |
5 |
Clean |
152 |
rainy_season |
339 |
5 |
Clean |
127 |
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
floor_material |
wall_material |
roof_material |
toilet_type |
toilet_satisfaction |
water_in_toilet |
rubbish_pit |
utensils_rack |
0 |
V4-01 |
Kazita |
earth |
mud bricks |
galvanised metal sheets / iron sheets |
flush toilet with flushing system |
fairly satisfied |
Water bucket in the toilet |
Yes |
Yes |
1 |
V4-02 |
Kazita |
earth |
mud bricks |
galvanised metal sheets / iron sheets |
simple pit latrine unimproved |
unsatisifed |
Water bucket in the toilet |
Yes |
Yes |
2 |
V4-03 |
Kazita |
earth |
mud and poles or stones |
galvanised metal sheets / iron sheets |
flush toilet |
very satisfied |
Water bucket in the toilet |
Yes |
Yes |
3 |
V4-04 |
Kazita |
earth |
mud and poles or stones |
galvanised metal sheets / iron sheets |
flush toilet with flushing system |
fairly satisfied |
No water in the toilet |
No |
Yes |
4 |
V4-05 |
Kazita |
earth |
baked or burnt bricks |
galvanised metal sheets / iron sheets |
flush toilet with flushing system |
fairly satisfied |
Water bucket in the toilet |
Yes |
Yes |
# Summary of the string (object) and categorical columns of
# `water_sanitation`, transposed so that each column becomes a row.
# The dtypes are selected by name because the `np.object` alias no longer
# exists in current NumPy releases and `pd.Categorical` is an array class,
# not a dtype.
water_sanitation.describe(include=['object', 'category']).T
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
count |
unique |
top |
freq |
id |
339 |
339 |
V3-09 |
1 |
village |
339 |
8 |
Misalai |
70 |
floor_material |
339 |
4 |
earth |
261 |
wall_material |
339 |
6 |
mud and poles or stones |
180 |
roof_material |
339 |
5 |
galvanised metal sheets / iron sheets |
291 |
toilet_type |
339 |
6 |
simple pit latrine unimproved |
189 |
toilet_satisfaction |
339 |
4 |
fairly satisfied |
157 |
water_in_toilet |
337 |
4 |
Water bucket in the toilet |
311 |
rubbish_pit |
339 |
2 |
Yes |
211 |
utensils_rack |
339 |
2 |
Yes |
202 |
# Distinct values of the `dry_season` column.
np.unique(water_quality['dry_season'])
# Preview the first rows of the water quality table.
water_quality.head()
<style>
.dataframe thead tr:only-child th {
text-align: right;
}
.dataframe thead th {
text-align: left;
}
.dataframe tbody tr th {
vertical-align: top;
}
</style>
|
id |
village |
dry_season |
rainy_season |
0 |
V4-01 |
Kazita |
Quite clean |
Quite clean |
1 |
V4-02 |
Kazita |
Quite clean |
Clean |
2 |
V4-03 |
Kazita |
Clean |
Clean |
3 |
V4-04 |
Kazita |
Clean |
Clean |
4 |
V4-05 |
Kazita |
Quite clean |
Quite clean |
# Count of rows for each `dry_season` value in `water_quality`; the axes
# object is kept in `ax` and left as the trailing expression.
ax = sns.countplot(data=water_quality, y='dry_season')
ax
<matplotlib.axes._subplots.AxesSubplot at 0x11125cf98>
Calidad del agua por estación
# Count of rows for each `dry_season` value in `water_quality`.
sns.countplot(data=water_quality, y='dry_season')
<matplotlib.axes._subplots.AxesSubplot at 0x1a1acab860>
# Count of rows for each `rainy_season` value in `water_quality`.
sns.countplot(data=water_quality, y='rainy_season')
<matplotlib.axes._subplots.AxesSubplot at 0x113643d30>