Data Manipulation with pandas
Transforming Data
Introducing DataFrames
# Exploring a DataFrame: .head()
dogs.head()
# Exploring a DataFrame: .info()
dogs.info()
# Exploring a DataFrame: .shape
dogs.shape
# Exploring a DataFrame: .describe()
dogs.describe()
# Components of a DataFrame: .values
dogs.values
# Components of a DataFrame: .columns and .index
dogs.columns
dogs.index
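The snippets in this sheet assume course-provided tables (dogs, vet_visits, dog_pack, and so on). To make the first chapters runnable on their own, here is a minimal sketch of a dogs table whose column names match the examples below; the values are invented for illustration:
import pandas as pd
# Hypothetical sample data; only the column names are taken from the examples
dogs = pd.DataFrame({
    "name": ["Bella", "Charlie", "Lucy"],
    "breed": ["Labrador", "Poodle", "Chow Chow"],
    "color": ["Brown", "Black", "Brown"],
    "height_cm": [56, 43, 46],
    "weight_kg": [25, 23, 22],
    "date_of_birth": ["2013-07-01", "2016-09-16", "2014-08-25"],
})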
Sorting and subsetting
# Sorting by multiple variables
dogs.sort_values(["weight_kg", "height_cm"], ascending=[True, False])
# Subsetting based on dates
dogs[dogs["date_of_birth"] > "2015-01-01"]
# Subsetting based on multiple conditions
is_lab = dogs["breed"] == "Labrador"
is_brown = dogs["color"] == "Brown"
dogs[is_lab & is_brown]
dogs[(dogs["breed"] == "Labrador") & (dogs["color"] == "Brown")]
# Subsetting using .isin()
is_black_or_brown = dogs["color"].isin(["Black", "Brown"])
dogs[is_black_or_brown]
New columns
# Adding a new column
dogs["height_m"] = dogs["height_cm"] / 100
Aggregating Data
Summary statistics
# Summarizing numerical data
dogs["height_cm"].mean()
# Other summary statistics
.median(), .mode()
.min(), .max()
.var(), .std()
.sum()
.quantile()
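These methods all follow the same pattern as .mean(); for example, using the hypothetical dogs data above:
dogs["height_cm"].max()          # height of the tallest dog
dogs["weight_kg"].quantile(0.5)  # 50th percentile, equivalent to .median()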
# The .agg() method
def pct30(column):
    return column.quantile(0.3)
dogs["weight_kg"].agg(pct30)
# Multiple summaries
def pct40(column):
    return column.quantile(0.4)
dogs["weight_kg"].agg([pct30, pct40])
# Cumulative sum
dogs["weight_kg"].cumsum()
#Cumulative statistics
.cummax()
.cummin() .cumprod()
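Unlike .sum() or .max(), the cumulative methods return a column of the same length as the input, holding the running statistic row by row; a minimal sketch:
import pandas as pd
pd.Series([1, 2, 3]).cumsum()  # returns 1, 3, 6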
Counting
# Dropping duplicate names
vet_visits.drop_duplicates(subset="name")
# Dropping duplicate pairs
unique_dogs = vet_visits.drop_duplicates(subset=["name", "breed"])
# Counting
unique_dogs["breed"].value_counts(sort=True)
Grouped summary statistics
# Summaries by group
dogs[dogs["color"] == "Black"]["weight_kg"].mean()
dogs[dogs["color"] == "Brown"]["weight_kg"].mean()
# Grouped summaries
dogs.groupby("color")["weight_kg"].mean()
# Multiple grouped summaries
dogs.groupby("color")["weight_kg"].agg(["min", "max", "sum"])
# Grouping by multiple variables
dogs.groupby(["color", "breed"])["weight_kg"].mean()
# Many groups, many summaries
dogs.groupby(["color", "breed"])[["weight_kg", "height_cm"]].mean()
Pivot tables
# Pivot table
dogs.pivot_table(values="weight_kg", index="color")
# Different statistics
import numpy as np
dogs.pivot_table(values="weight_kg", index="color", aggfunc=np.median)
# Multiple statistics
dogs.pivot_table(values="weight_kg", index="color", aggfunc=[np.mean, np.median])
# Pivot on two variables
dogs.groupby(["color", "breed"])["weight_kg"].mean()
dogs.pivot_table(values="weight_kg", index="color", columns="breed")
# Filling missing values in pivot tables
dogs.pivot_table(values="weight_kg", index="color", columns="breed", fill_value=0)
# Summing with pivot tables
dogs.pivot_table(values="weight_kg", index="color", columns="breed",
                 fill_value=0, margins=True)
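With margins=True, pivot_table appends an "All" row and column holding the statistic computed over all rows or columns, which can be selected like any other label; for example:
pt = dogs.pivot_table(values="weight_kg", index="color", columns="breed",
                      fill_value=0, margins=True)
pt.loc["All"]  # the statistic over every row, per breed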
Slicing and Indexing
Explicit indexes
# Setting a column as the index
dogs_ind = dogs.set_index("name")
# Removing an index
dogs_ind.reset_index()
# Dropping an index
dogs_ind.reset_index(drop=True)
# Indexes make subsetting simpler
dogs[dogs["name"].isin(["Bella", "Stella"])]
# versus
dogs_ind.loc[["Bella", "Stella"]]
# Multi-level indexes a.k.a. hierarchical indexes
dogs_ind3 = dogs.set_index(["breed", "color"])
# Subset the outer level with a list
dogs_ind3.loc[["Labrador", "Chihuahua"]]
# Subset inner levels with a list of tuples
dogs_ind3.loc[[("Labrador", "Brown"), ("Chihuahua", "Tan")]]
# Sorting by index values
dogs_ind3.sort_index()
# Controlling sort_index
dogs_ind3.sort_index(level=["color", "breed"], ascending=[True, False])
Slicing and subsetting with .loc and .iloc
# Sort the index before you slice
dogs_srt = dogs.set_index(["breed", "color"]).sort_index()
# Slicing the outer index level
dogs_srt.loc["Chow Chow":"Poodle"]
# Slicing the inner index levels correctly
dogs_srt.loc[("Labrador", "Brown"):("Schnauzer", "Grey")]
# Slicing columns
dogs_srt.loc[:, "name":"height_cm"]
# Slice twice
dogs_srt.loc[("Labrador", "Brown"):("Schnauzer", "Grey"), "name":"height_cm"]
# Dog days
dogs = dogs.set_index("date_of_birth").sort_index()
# Slicing by dates
# Get dogs with date_of_birth between 2014-08-25 and 2016-09-16
dogs.loc["2014-08-25":"2016-09-16"]
# Slicing by partial dates
# Get dogs with date_of_birth between 2014-01-01 and 2016-12-31
dogs.loc["2014":"2016"]
# Subsetting by row/column number
print(dogs.iloc[2:5, 1:4])
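Note that .iloc slices exclude the final position, following standard Python slicing, while .loc slices include the final label; for example:
dogs.iloc[2:5]           # rows 2, 3 and 4; row 5 is excluded
dogs.loc["2014":"2016"]  # both endpoint years are included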
Working with pivot tables
# Pivoting the dog pack
dogs_height_by_breed_vs_color = dog_pack.pivot_table(
    values="height_cm", index="breed", columns="color")
# The axis argument
dogs_height_by_breed_vs_color.mean(axis="index")
# Calculating summary stats across columns
dogs_height_by_breed_vs_color.mean(axis="columns")
Creating and Visualizing DataFrames
Visualizing your data
# Histograms
import matplotlib.pyplot as plt
dog_pack["height_cm"].hist(bins=20)
# Bar plots
avg_weight_by_breed = dog_pack.groupby("breed")["weight_kg"].mean()
avg_weight_by_breed.plot(kind="bar", title="Mean Weight by Dog Breed")
# Line plots
sully.head()
sully.plot(x="date", y="weight_kg", kind="line")
# Rotating axis labels
sully.plot(x="date", y="weight_kg", kind="line", rot=45)
# Scatter plots
dog_pack.plot(x="height_cm", y="weight_kg", kind="scatter")
# Layering plots
dog_pack[dog_pack["sex"] == "F"]["height_cm"].hist()
dog_pack[dog_pack["sex"] == "M"]["height_cm"].hist()
# Add a legend
plt.legend(["F", "M"])
# Transparency
dog_pack[dog_pack["sex"] == "F"]["height_cm"].hist(alpha=0.7)
dog_pack[dog_pack["sex"] == "M"]["height_cm"].hist(alpha=0.7)
plt.legend(["F", "M"])
Missing values
# Detecting missing values
dogs.isna()
# Detecting any missing values
dogs.isna().any()
# Counting missing values
dogs.isna().sum()
# Plotting missing values
import matplotlib.pyplot as plt
dogs.isna().sum().plot(kind="bar")
plt.show()
# Removing rows containing missing values
dogs.dropna()
# Replacing missing values
dogs.fillna(0)
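.fillna() also accepts a dict mapping column names to fill values, so each column can get its own replacement; a sketch with hypothetical fill values:
dogs.fillna({"weight_kg": 0, "breed": "Unknown"})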
Reading and writing CSVs
# CSV to DataFrame
import pandas as pd
new_dogs = pd.read_csv("new_dogs.csv")
# DataFrame to CSV
new_dogs.to_csv("new_dogs_with_bmi.csv")
# CSV to DataFrame, parsing dates and using the date column as the index
climate_change = pd.read_csv(prefix + 'climate_change.csv',
                             parse_dates=['date'], index_col='date')
Joining data with pandas
Data merging basics
Inner join
# Inner join
wards_census = wards.merge(census, on='ward')
# Suffixes
wards_census = wards.merge(census, on='ward', suffixes=('_ward', '_cen'))
One-to-many relationships
# One-to-many example
ward_licenses = wards.merge(licenses, on='ward', suffixes=('_ward', '_lic'))
Merging multiple DataFrames
# Single merge
grants.merge(licenses, on=['address', 'zip'])
# Merging multiple tables
grants_licenses_ward = grants.merge(licenses, on=['address', 'zip']) \
    .merge(wards, on='ward', suffixes=('_bus', '_ward'))
# Plot results
import matplotlib.pyplot as plt
grants_licenses_ward.groupby('ward').agg('sum').plot(kind='bar', y='grant')
Merging Tables With Different Join Types
Left join
# Merge with left join
movies_taglines = movies.merge(taglines, on='id', how='left')
Other joins
# Merge with right join
tv_movies = movies.merge(tv_genre, how='right',
                         left_on='id', right_on='movie_id')
# Merge with outer join
family_comedy = family.merge(comedy, on='movie_id', how='outer',
                             suffixes=('_fam', '_com'))
Merging a table to itself
# Merging a table to itself
original_sequels = sequels.merge(sequels, left_on='sequel', right_on='id',
                                 suffixes=('_org', '_seq'))
Merging on indexes
# Setting an index
movies = pd.read_csv('tmdb_movies.csv', index_col=['id'])
# Merging on index
movies_taglines = movies.merge(taglines, on='id', how='left')
# MultiIndex merge
samuel_casts = samuel.merge(casts, on=['movie_id', 'cast_id'])
# Index merge with left_on and right_on
movies_genres = movies.merge(movie_to_genres, left_on='id', left_index=True,
                             right_on='movie_id', right_index=True)
Advanced Merging and Concatenating
Filtering joins
###########
# Semi-join: filter genres to those with a match in top_tracks
###########
# Step 1 - inner join the tables
genres_tracks = genres.merge(top_tracks, on='gid')
# Step 2 - check which genre ids appear in the join
genres['gid'].isin(genres_tracks['gid'])
# Step 3 - combine the steps
genres_tracks = genres.merge(top_tracks, on='gid')
top_genres = genres[genres['gid'].isin(genres_tracks['gid'])]
###########
# Anti-join: keep genres with no match in top_tracks
###########
# Step 1 - left join with an indicator column
genres_tracks = genres.merge(top_tracks, on='gid', how='left', indicator=True)
# Step 2 - collect the ids that appear only in the left table
gid_list = genres_tracks.loc[genres_tracks['_merge'] == 'left_only', 'gid']
# Step 3 - combine the steps
genres_tracks = genres.merge(top_tracks, on='gid', how='left', indicator=True)
gid_list = genres_tracks.loc[genres_tracks['_merge'] == 'left_only', 'gid']
non_top_genres = genres[genres['gid'].isin(gid_list)]
Concatenate DataFrames together vertically
# Basic concatenation
pd.concat([inv_jan, inv_feb, inv_mar])
# Ignoring the index
pd.concat([inv_jan, inv_feb, inv_mar], ignore_index=True)
# Setting labels to original tables
pd.concat([inv_jan, inv_feb, inv_mar], ignore_index=False,
          keys=['jan', 'feb', 'mar'])
# Concatenate tables with different column names, sorting the columns alphabetically
pd.concat([inv_jan, inv_feb], sort=True)
# Concatenate tables with different column names, keeping only the shared columns
pd.concat([inv_jan, inv_feb], join='inner')
# Append the tables (see the pd.concat equivalent below)
inv_jan.append([inv_feb, inv_mar], ignore_index=True, sort=True)
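Note that DataFrame.append() was removed in pandas 2.0; the equivalent pd.concat call is:
pd.concat([inv_jan, inv_feb, inv_mar], ignore_index=True, sort=True)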
Verifying integrity
# Validating merges
.merge(validate=None)
Checks if the merge is of the specified type:
'one_to_one'
'one_to_many'
'many_to_one'
'many_to_many'
# Merge validate: one_to_one
tracks.merge(specs, on='tid',
             validate='one_to_one')
# Merge validate: one_to_many
albums.merge(tracks, on='aid',
             validate='one_to_many')
# Verifying concatenations
.concat(verify_integrity=False)
Checks whether the new concatenated index contains duplicates. The default value is False.
Merging Ordered and Time-Series Data
Using merge_ordered()
# Merging stock data
import pandas as pd
pd.merge_ordered(appl, mcd, on='date', suffixes=('_aapl', '_mcd'))
# Forward fill example
pd.merge_ordered(appl, mcd, on='date',
                 suffixes=('_aapl', '_mcd'),
                 fill_method='ffill')
Using merge_asof()
# merge_asof() example
pd.merge_asof(visa, ibm, on='date_time',
              suffixes=('_visa', '_ibm'))
# merge_asof() example with direction
pd.merge_asof(visa, ibm, on=['date_time'],
              suffixes=('_visa', '_ibm'),
              direction='forward')
Selecting data with .query()
# Querying on a single condition
stocks.query('nike >= 90')
# Querying on multiple conditions with "and" / "or"
stocks.query('nike > 90 and disney < 140')
stocks.query('nike > 96 or disney < 98')
# Using .query() to select text
stocks_long.query('stock == "disney" or (stock == "nike" and close < 90)')
Reshaping data with .melt()
# Example of .melt()
social_fin_tall = social_fin.melt(id_vars=['financial', 'company'])
# Melting with value_vars
social_fin_tall = social_fin.melt(id_vars=['financial', 'company'],
                                  value_vars=['2018', '2017'])
# Melting with column names
social_fin_tall = social_fin.melt(id_vars=['financial', 'company'],
                                  value_vars=['2018', '2017'],
                                  var_name='year', value_name='dollars')