Data Manipulation with pandas
Transforming Data
Introducing DataFrames
# Exploring a DataFrame: .head()
dogs.head()
# Exploring a DataFrame: .info()
dogs.info()
# Exploring a DataFrame: .shape
dogs.shape
# Exploring a DataFrame: .describe()
dogs.describe()
# Components of a DataFrame: .values
dogs.values
# Components of a DataFrame: .columns and .index
dogs.columns
dogs.index
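The snippets in this sheet assume course-provided tables (dogs, vet_visits, dog_pack, and so on). To make the first chapters runnable on their own, here is a minimal sketch of a dogs table whose column names match the examples below; the values are invented for illustration:
import pandas as pd
# Hypothetical sample data; only the column names are taken from the examples
dogs = pd.DataFrame({
    "name": ["Bella", "Charlie", "Lucy"],
    "breed": ["Labrador", "Poodle", "Chow Chow"],
    "color": ["Brown", "Black", "Brown"],
    "height_cm": [56, 43, 46],
    "weight_kg": [25, 23, 22],
    "date_of_birth": ["2013-07-01", "2016-09-16", "2014-08-25"],
})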
Sorting and subsetting
# Sorting by multiple variables
dogs.sort_values(["weight_kg", "height_cm"], ascending=[True, False])
# Subsetting based on dates
dogs[dogs["date_of_birth"] > "2015-01-01"]
# Subsetting based on multiple conditions
is_lab = dogs["breed"] == "Labrador"
is_brown = dogs["color"] == "Brown"
dogs[is_lab & is_brown]
dogs[(dogs["breed"] == "Labrador") & (dogs["color"] == "Brown")]
# Subsetting using .isin()
is_black_or_brown = dogs["color"].isin(["Black", "Brown"])
dogs[is_black_or_brown]
New columns
# Adding a new column
dogs["height_m"] = dogs["height_cm"] / 100
Aggregating Data
Summary statistics
# Summarizing numerical data
dogs["height_cm"].mean()
# Other summary statistics
.median(), .mode()
.min(), .max()
.var(), .std()
.sum()
.quantile()
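These methods all follow the same pattern as .mean(); for example, using the hypothetical dogs data above:
dogs["height_cm"].max()          # height of the tallest dog
dogs["weight_kg"].quantile(0.5)  # 50th percentile, equivalent to .median()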
# The .agg() method
def pct30(column):
    return column.quantile(0.3)
dogs["weight_kg"].agg(pct30)
# Multiple summaries
def pct40(column):
    return column.quantile(0.4)
dogs["weight_kg"].agg([pct30, pct40])
# Cumulative sum
dogs["weight_kg"].cumsum()
#Cumulative statistics
.cummax()
.cummin() .cumprod()
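Unlike .sum() or .max(), the cumulative methods return a column of the same length as the input, holding the running statistic row by row; a minimal sketch:
import pandas as pd
pd.Series([1, 2, 3]).cumsum()  # returns 1, 3, 6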
Counting
# Dropping duplicate names
vet_visits.drop_duplicates(subset="name")
# Dropping duplicate pairs
unique_dogs = vet_visits.drop_duplicates(subset=["name", "breed"])
# Counting
unique_dogs["breed"].value_counts(sort=True)
Grouped summary statistics
# Summaries by group
dogs[dogs["color"] == "Black"]["weight_kg"].mean()
dogs[dogs["color"] == "Brown"]["weight_kg"].mean()
# Grouped summaries
dogs.groupby("color")["weight_kg"].mean()
# Multiple grouped summaries
dogs.groupby("color")["weight_kg"].agg(["min", "max", "sum"])
# Grouping by multiple variables
dogs.groupby(["color", "breed"])["weight_kg"].mean()
# Many groups, many summaries
dogs.groupby(["color", "breed"])[["weight_kg", "height_cm"]].mean()
Pivot tables
# Pivot table
dogs.pivot_table(values="weight_kg", index="color")
# Different statistics
import numpy as np
dogs.pivot_table(values="weight_kg", index="color", aggfunc=np.median)
# Multiple statistics
dogs.pivot_table(values="weight_kg", index="color", aggfunc=[np.mean, np.median])
# Pivot on two variables
dogs.groupby(["color", "breed"])["weight_kg"].mean()
dogs.pivot_table(values="weight_kg", index="color", columns="breed")
# Filling missing values in pivot tables
dogs.pivot_table(values="weight_kg", index="color", columns="breed", fill_value=0)
# Summing with pivot tables
dogs.pivot_table(values="weight_kg", index="color", columns="breed",
                 fill_value=0, margins=True)
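With margins=True, pivot_table appends an "All" row and column holding the statistic computed over all rows or columns, which can be selected like any other label; for example:
pt = dogs.pivot_table(values="weight_kg", index="color", columns="breed",
                      fill_value=0, margins=True)
pt.loc["All"]  # the statistic over every row, per breed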
Slicing and Indexing
Explicit indexes
# Setting a column as the index
dogs_ind = dogs.set_index("name")
# Removing an index
dogs_ind.reset_index()
# Dropping an index
dogs_ind.reset_index(drop=True)
# Indexes make subsetting simpler
dogs[dogs["name"].isin(["Bella", "Stella"])]
# versus
dogs_ind.loc[["Bella", "Stella"]]
# Multi-level indexes a.k.a. hierarchical indexes
dogs_ind3 = dogs.set_index(["breed", "color"])
# Subset the outer level with a list
dogs_ind3.loc[["Labrador", "Chihuahua"]]
# Subset inner levels with a list of tuples
dogs_ind3.loc[[("Labrador", "Brown"), ("Chihuahua", "Tan")]]
# Sorting by index values
dogs_ind3.sort_index()
# Controlling sort_index
dogs_ind3.sort_index(level=["color", "breed"], ascending=[True, False])
Slicing and subsetting with .loc and .iloc
# Sort the index before you slice
dogs_srt = dogs.set_index(["breed", "color"]).sort_index()
# Slicing the outer index level
dogs_srt.loc["Chow Chow":"Poodle"]
# Slicing the inner index levels correctly
dogs_srt.loc[("Labrador", "Brown"):("Schnauzer", "Grey")]
# Slicing columns
dogs_srt.loc[:, "name":"height_cm"]
# Slice twice
dogs_srt.loc[("Labrador", "Brown"):("Schnauzer", "Grey"), "name":"height_cm"]
# Dog days
dogs = dogs.set_index("date_of_birth").sort_index()
# Slicing by dates
# Get dogs with date_of_birth between 2014-08-25 and 2016-09-16
dogs.loc["2014-08-25":"2016-09-16"]
# Slicing by partial dates
# Get dogs with date_of_birth between 2014-01-01 and 2016-12-31
dogs.loc["2014":"2016"]
# Subsetting by row/column number
print(dogs.iloc[2:5, 1:4])
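Note that .iloc slices exclude the final position, following standard Python slicing, while .loc slices include the final label; for example:
dogs.iloc[2:5]           # rows 2, 3 and 4; row 5 is excluded
dogs.loc["2014":"2016"]  # both endpoint years are included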
Working with pivot tables
# Pivoting the dog pack
dogs_height_by_breed_vs_color = dog_pack.pivot_table(
    values="height_cm", index="breed", columns="color")
# The axis argument
dogs_height_by_breed_vs_color.mean(axis="index")
# Calculating summary stats across columns
dogs_height_by_breed_vs_color.mean(axis="columns")
Creating and Visualizing DataFrames
Visualizing your data
# Histograms
import matplotlib.pyplot as plt
dog_pack["height_cm"].hist(bins=20)
# Bar plots
avg_weight_by_breed = dog_pack.groupby("breed")["weight_kg"].mean()
avg_weight_by_breed.plot(kind="bar", title="Mean Weight by Dog Breed")
# Line plots
sully.head()
sully.plot(x="date", y="weight_kg", kind="line")
# Rotating axis labels
sully.plot(x="date", y="weight_kg", kind="line", rot=45)
# Scatter plots
dog_pack.plot(x="height_cm", y="weight_kg", kind="scatter")
# Layering plots
dog_pack[dog_pack["sex"] == "F"]["height_cm"].hist()
dog_pack[dog_pack["sex"] == "M"]["height_cm"].hist()
# Add a legend
plt.legend(["F", "M"])
# Transparency
dog_pack[dog_pack["sex"] == "F"]["height_cm"].hist(alpha=0.7)
dog_pack[dog_pack["sex"] == "M"]["height_cm"].hist(alpha=0.7)
plt.legend(["F", "M"])
Missing values
# Detecting missing values
dogs.isna()
# Detecting any missing values
dogs.isna().any()
# Counting missing values
dogs.isna().sum()
# Plotting missing values
import matplotlib.pyplot as plt
dogs.isna().sum().plot(kind="bar")
plt.show()
# Removing rows containing missing values
dogs.dropna()
# Replacing missing values
dogs.fillna(0)
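.fillna() also accepts a dict mapping column names to fill values, so each column can get its own replacement; a sketch with hypothetical fill values:
dogs.fillna({"weight_kg": 0, "breed": "Unknown"})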
Reading and writing CSVs
# CSV to DataFrame
import pandas as pd
new_dogs = pd.read_csv("new_dogs.csv")
# DataFrame to CSV
new_dogs.to_csv("new_dogs_with_bmi.csv")
# CSV to DataFrame, parsing dates and using the date column as the index
climate_change = pd.read_csv(prefix + 'climate_change.csv',
                             parse_dates=['date'], index_col='date')
Joining data with pandas
Data merging basics
Inner join
# Inner join
wards_census = wards.merge(census, on='ward')
# Suffixes
wards_census = wards.merge(census, on='ward', suffixes=('_ward', '_cen'))
One-to-many relationships
# One-to-many example
ward_licenses = wards.merge(licenses, on='ward', suffixes=('_ward', '_lic'))
Merging multiple DataFrames
# Single merge
grants.merge(licenses, on=['address', 'zip'])
# Merging multiple tables
grants_licenses_ward = grants.merge(licenses, on=['address', 'zip']) \
    .merge(wards, on='ward', suffixes=('_bus', '_ward'))
# Plot results
import matplotlib.pyplot as plt
grants_licenses_ward.groupby('ward').agg('sum').plot(kind='bar', y='grant')
Merging Tables With Different Join Types
Left join
# Merge with left join
movies_taglines = movies.merge(taglines, on='id', how='left')
Other joins
# Merge with right join
tv_movies = movies.merge(tv_genre, how='right',
                         left_on='id', right_on='movie_id')
# Merge with outer join
family_comedy = family.merge(comedy, on='movie_id', how='outer',
                             suffixes=('_fam', '_com'))
Merging a table to itself
# Merging a table to itself
original_sequels = sequels.merge(sequels, left_on='sequel', right_on='id',
                                 suffixes=('_org', '_seq'))
Merging on indexes
# Setting an index
movies = pd.read_csv('tmdb_movies.csv', index_col=['id'])
# Merging on index
movies_taglines = movies.merge(taglines, on='id', how='left')
# MultiIndex merge
samuel_casts = samuel.merge(casts, on=['movie_id', 'cast_id'])
# Index merge with left_on and right_on
movies_genres = movies.merge(movie_to_genres, left_on='id', left_index=True,
                             right_on='movie_id', right_index=True)
Advanced Merging and Concatenating
Filtering joins
###########
# Semi-join: filter genres to those with a match in top_tracks
###########
# Step 1 - inner join the tables
genres_tracks = genres.merge(top_tracks, on='gid')
# Step 2 - check which genre ids appear in the join
genres['gid'].isin(genres_tracks['gid'])
# Step 3 - combine the steps
genres_tracks = genres.merge(top_tracks, on='gid')
top_genres = genres[genres['gid'].isin(genres_tracks['gid'])]
###########
# Anti-join: keep genres with no match in top_tracks
###########
# Step 1 - left join with an indicator column
genres_tracks = genres.merge(top_tracks, on='gid', how='left', indicator=True)
# Step 2 - collect the ids that appear only in the left table
gid_list = genres_tracks.loc[genres_tracks['_merge'] == 'left_only', 'gid']
# Step 3 - combine the steps
genres_tracks = genres.merge(top_tracks, on='gid', how='left', indicator=True)
gid_list = genres_tracks.loc[genres_tracks['_merge'] == 'left_only', 'gid']
non_top_genres = genres[genres['gid'].isin(gid_list)]
Concatenate DataFrames together vertically
# Basic concatenation
pd.concat([inv_jan, inv_feb, inv_mar])
# Ignoring the index
pd.concat([inv_jan, inv_feb, inv_mar], ignore_index=True)
# Setting labels to original tables
pd.concat([inv_jan, inv_feb, inv_mar], ignore_index=False,
          keys=['jan', 'feb', 'mar'])
# Concatenate tables with different column names, sorting the columns alphabetically
pd.concat([inv_jan, inv_feb], sort=True)
# Concatenate tables with different column names, keeping only the shared columns
pd.concat([inv_jan, inv_feb], join='inner')
# Append the tables (see the pd.concat equivalent below)
inv_jan.append([inv_feb, inv_mar], ignore_index=True, sort=True)
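Note that DataFrame.append() was removed in pandas 2.0; the equivalent pd.concat call is:
pd.concat([inv_jan, inv_feb, inv_mar], ignore_index=True, sort=True)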
Verifying integrity
# Validating merges
.merge(validate=None)
Checks if the merge is of the specified type:
'one_to_one'
'one_to_many'
'many_to_one'
'many_to_many'
# Merge validate: one_to_one
tracks.merge(specs, on='tid',
             validate='one_to_one')
# Merge validate: one_to_many
albums.merge(tracks, on='aid',
             validate='one_to_many')
# Verifying concatenations
.concat(verify_integrity=False)
Checks whether the new concatenated index contains duplicates. The default value is False.
Merging Ordered and Time-Series Data
Using merge_ordered()
# Merging stock data
import pandas as pd
pd.merge_ordered(appl, mcd, on='date', suffixes=('_aapl', '_mcd'))
# Forward fill example
pd.merge_ordered(appl, mcd, on='date',
                 suffixes=('_aapl', '_mcd'),
                 fill_method='ffill')
Using merge_asof()
# merge_asof() example
pd.merge_asof(visa, ibm, on='date_time',
              suffixes=('_visa', '_ibm'))
# merge_asof() example with direction
pd.merge_asof(visa, ibm, on=['date_time'],
              suffixes=('_visa', '_ibm'),
              direction='forward')
Selecting data with .query()
# Querying on a single condition
stocks.query('nike >= 90')
# Querying on multiple conditions with "and" / "or"
stocks.query('nike > 90 and disney < 140')
stocks.query('nike > 96 or disney < 98')
# Using .query() to select text
stocks_long.query('stock == "disney" or (stock == "nike" and close < 90)')
Reshaping data with .melt()
# Example of .melt()
social_fin_tall = social_fin.melt(id_vars=['financial', 'company'])
# Melting with value_vars
social_fin_tall = social_fin.melt(id_vars=['financial', 'company'],
                                  value_vars=['2018', '2017'])
# Melting with column names
social_fin_tall = social_fin.melt(id_vars=['financial', 'company'],
                                  value_vars=['2018', '2017'],
                                  var_name='year', value_name='dollars')