pandas cheatsheet
there are so many different ways to do things with pandas, ...
- Data Manipulation with pandas
- Joining data with pandas
Introducing DataFrames
# Exploring a DataFrame: .head()
dogs.head()
# Exploring a DataFrame: .info()
dogs.info()
# Exploring a DataFrame: .shape
dogs.shape
# Exploring a DataFrame: .describe()
dogs.describe()
# Components of a DataFrame: .values
dogs.values
# Components of a DataFrame: .columns and .index
dogs.columns
dogs.index
Sorting and subsetting
#Sorting by multiple variables
dogs.sort_values(["weight_kg", "height_cm"], ascending=[True, False])
#Subsetting based on dates
dogs[dogs["date_of_birth"] > "2015-01-01"]
#Subsetting based on multiple conditions
is_lab = dogs["breed"] == "Labrador"
is_brown = dogs["color"] == "Brown"
dogs[is_lab & is_brown]
dogs[ (dogs["breed"] == "Labrador") & (dogs["color"] == "Brown") ]
#Subsetting using .isin()
is_black_or_brown = dogs["color"].isin(["Black", "Brown"])
dogs[is_black_or_brown]
Summary statistics
#Summarizing numerical data
dogs["height_cm"].mean()
.median() , .mode()
.min() , .max()
.var() , .std()
.sum()
.quantile()
#The .agg() method
def pct30(column):
    """Return the 30th percentile (the 0.3 quantile) of the given column."""
    return column.quantile(0.3)
dogs["weight_kg"].agg(pct30)
#Multiple summaries
def pct40(column):
    """Return the 40th percentile (the 0.4 quantile) of the given column."""
    return column.quantile(0.4)
dogs["weight_kg"].agg([pct30, pct40])
#Cumulative sum
dogs["weight_kg"].cumsum()
#Cumulative statistics
.cummax()
.cummin()
.cumprod()
Grouped summary statistics
#Summaries by group
dogs[dogs["color"] == "Black"]["weight_kg"].mean()
dogs[dogs["color"] == "Brown"]["weight_kg"].mean()
#Grouped summaries
dogs.groupby("color")["weight_kg"].mean()
#Multiple grouped summaries
dogs.groupby("color")["weight_kg"].agg(["min", "max", "sum"])  # use string names; passing builtins like min/max is deprecated
#Grouping by multiple variables
dogs.groupby(["color", "breed"])["weight_kg"].mean()
#Many groups, many summaries
dogs.groupby(["color", "breed"])[["weight_kg", "height_cm"]].mean()
Pivot tables
#pivot table
dogs.pivot_table(values="weight_kg",index="color")
#Different statistics
import numpy as np
# string aggfunc names are preferred; passing np.median directly is deprecated in pandas >= 2.1
dogs.pivot_table(values="weight_kg", index="color", aggfunc="median")
#Multiple statistics
dogs.pivot_table(values="weight_kg", index="color", aggfunc=["mean", "median"])
#Pivot on two variables
dogs.groupby(["color", "breed"])["weight_kg"].mean()
dogs.pivot_table(values="weight_kg", index="color", columns="breed")
#Filling missing values in pivot tables
dogs.pivot_table(values="weight_kg", index="color", columns="breed", fill_value=0)
# Summing with pivot tables
dogs.pivot_table(values="weight_kg", index="color", columns="breed",
fill_value=0, margins=True)
Explicit indexes
# Setting a column as the index
dogs_ind = dogs.set_index("name")
# Removing an index
dogs_ind.reset_index()
# Dropping an index
dogs_ind.reset_index(drop=True)
# Indexes make subsetting simpler
dogs[dogs["name"].isin(["Bella", "Stella"])]
# versus
dogs_ind.loc[["Bella", "Stella"]]
# Multi-level indexes a.k.a. hierarchical indexes
dogs_ind3 = dogs.set_index(["breed", "color"])
# Subset the outer level with a list
dogs_ind3.loc[["Labrador", "Chihuahua"]]
# Subset inner levels with a list of tuples
dogs_ind3.loc[[("Labrador", "Brown"), ("Chihuahua", "Tan")]]
# Sorting by index values
dogs_ind3.sort_index()
# Controlling sort_index
dogs_ind3.sort_index(level=["color", "breed"], ascending=[True, False])
Slicing and subsetting with .loc and .iloc
# Sort the index before you slice
dogs_srt = dogs.set_index(["breed", "color"]).sort_index()
# Slicing the outer index level
dogs_srt.loc["Chow Chow":"Poodle"]
# Slicing the inner index levels correctly
dogs_srt.loc[("Labrador", "Brown"):("Schnauzer", "Grey")]
# Slicing columns
dogs_srt.loc[:, "name":"height_cm"]
# Slice twice
dogs_srt.loc[
("Labrador", "Brown"):("Schnauzer", "Grey"),
"name":"height_cm"]
# Dog days
dogs = dogs.set_index("date_of_birth").sort_index()
# Slicing by dates
# Get dogs with date_of_birth between 2014-08-25 and 2016-09-16
dogs.loc["2014-08-25":"2016-09-16"]
# Slicing by partial dates
# Get dogs with date_of_birth between 2014-01-01 and 2016-12-31
dogs.loc["2014":"2016"]
# Subsetting by row/column number
print(dogs.iloc[2:5, 1:4])
Working with pivot tables
# Pivoting the dog pack
dogs_height_by_breed_vs_color = dog_pack.pivot_table(
"height_cm", index="breed", columns="color")
# The axis argument
dogs_height_by_breed_vs_color.mean(axis="index")
# Calculating summary stats across columns
dogs_height_by_breed_vs_color.mean(axis="columns")
Visualizing your data
# Histograms
import matplotlib.pyplot as plt
dog_pack["height_cm"].hist(bins=20)
# Bar plots
avg_weight_by_breed = dog_pack.groupby("breed")["weight_kg"].mean()
avg_weight_by_breed.plot(kind="bar", title="Mean Weight by Dog Breed")
# Line plots
sully.head()
sully.plot(x="date", y="weight_kg", kind="line")
# Rotating axis labels
sully.plot(x="date", y="weight_kg", kind="line", rot=45)
# Scatter plots
dog_pack.plot(x="height_cm", y="weight_kg", kind="scatter")
# Layering plots
dog_pack[dog_pack["sex"]=="F"]["height_cm"].hist()
dog_pack[dog_pack["sex"]=="M"]["height_cm"].hist()
# Add a legend
plt.legend(["F", "M"])
# Transparency
dog_pack[dog_pack["sex"]=="F"]["height_cm"].hist(alpha=0.7)
dog_pack[dog_pack["sex"]=="M"]["height_cm"].hist(alpha=0.7)
plt.legend(["F", "M"])
Missing values
# Detecting missing values
dogs.isna()
# Detecting any missing values
dogs.isna().any()
# Counting missing values
dogs.isna().sum()
# Plotting missing values
import matplotlib.pyplot as plt
dogs.isna().sum().plot(kind="bar")
plt.show()
# Removing rows containing missing values
dogs.dropna()
# Replacing missing values
dogs.fillna(0)
Reading and writing CSVs
# CSV to DataFrame
import pandas as pd
new_dogs = pd.read_csv("new_dogs.csv")
# DataFrame to CSV
new_dogs.to_csv("new_dogs_with_bmi.csv")
# CSV to dataframe parsing dates, and having date as index
climate_change = pd.read_csv(prefix+'climate_change.csv', parse_dates=['date'], index_col='date')
Merging multiple DataFrames
# Single merge
grants.merge(licenses, on=['address','zip'])
# Merging multiple tables
grants_licenses_ward = grants.merge(licenses, on=['address','zip']) \
.merge(wards, on='ward', suffixes=('_bus','_ward'))
# Plot Results
import matplotlib.pyplot as plt
grants_licenses_ward.groupby('ward').agg('sum').plot(kind='bar', y='grant')
Merging on indexes
# Setting an index
movies = pd.read_csv('tmdb_movies.csv', index_col=['id'])
# Merging on index
movies_taglines = movies.merge(taglines, on='id', how='left')
# MultiIndex merge
samuel_casts = samuel.merge(casts, on=['movie_id','cast_id'])
# Index merge with left_on and right_on
movies_genres = movies.merge(movie_to_genres, left_on='id', left_index=True,
right_on='movie_id', right_index=True)
Filtering joins
###########
# semi-join
# Step 1 - semi-join
genres_tracks = genres.merge(top_tracks, on='gid')
# Step 2 - semi-join
genres['gid'].isin(genres_tracks['gid'])
# Step 3 - semi-join
genres_tracks = genres.merge(top_tracks, on='gid')
top_genres = genres[genres['gid'].isin(genres_tracks['gid'])]
###########
# anti-join
# Step 1 - anti-join
genres_tracks = genres.merge(top_tracks, on='gid', how='left', indicator=True)
# Step 2 - anti-join
gid_list = genres_tracks.loc[genres_tracks['_merge'] == 'left_only', 'gid']
# Step 3 - anti-join
genres_tracks = genres.merge(top_tracks, on='gid', how='left', indicator=True)
gid_list = genres_tracks.loc[genres_tracks['_merge'] == 'left_only','gid']
non_top_genres = genres[genres['gid'].isin(gid_list)]
Concatenate DataFrames together vertically
# Basic concatenation
pd.concat([inv_jan, inv_feb, inv_mar])
# Ignoring the index
pd.concat([inv_jan, inv_feb, inv_mar],
ignore_index=True)
# Setting labels to original tables
pd.concat([inv_jan, inv_feb, inv_mar],
ignore_index=False,
keys=['jan','feb','mar'])
# Concatenate tables with different column names
pd.concat([inv_jan, inv_feb],
sort=True)
# Concatenate tables with different column names
pd.concat([inv_jan, inv_feb],
join='inner')
# Append the tables (DataFrame.append was removed in pandas 2.0 — use pd.concat instead)
pd.concat([inv_jan, inv_feb, inv_mar],
ignore_index=True,
sort=True)
Verifying integrity
# Validating merges
.merge(validate=None) :
Checks if merge is of specified type
'one_to_one'
'one_to_many'
'many_to_one'
'many_to_many'
# Merge validate: one_to_one
tracks.merge(specs, on='tid',
validate='one_to_one')
# Merge validate: one_to_many
albums.merge(tracks, on='aid',
validate='one_to_many')
# Verifying concatenations
.concat(verify_integrity=False) :
Check whether the new concatenated index contains duplicates
Default value is False
Selecting data with .query()
# Querying on a single condition
stocks.query('nike >= 90')
# Querying on multiple conditions, "and", "or"
stocks.query('nike > 90 and disney < 140')
stocks.query('nike > 96 or disney < 98')
# Using .query() to select text
stocks_long.query('stock=="disney" or (stock=="nike" and close < 90)')
Reshaping data with .melt()
# Example of .melt()
social_fin_tall = social_fin.melt(id_vars=['financial','company'])
# Melting with value_vars
social_fin_tall = social_fin.melt(id_vars=['financial','company'],
value_vars=['2018','2017'])
# Melting with column names
social_fin_tall = social_fin.melt(id_vars=['financial','company'],
value_vars=['2018','2017'],
var_name='year', value_name='dollars')