Introduction to Data Visualization with Seaborn

PDF lecture slides are available on GitHub.

Introduction to Seaborn

Introduction to Seaborn

# Getting started: seaborn builds the plots, matplotlib displays them.
import matplotlib.pyplot as plt
import seaborn as sns

# Example 1: Scatter plot -- one point per (height, weight) pair.
height = [62, 64, 69, 75, 66, 68, 65, 71, 76, 73]
weight = [120, 136, 148, 175, 137, 165, 154, 172, 200, 187]
sns.scatterplot(x=height, y=weight)
plt.show()

# Example 2: Create a count plot -- one bar per distinct value in the list.
gender = ["Female"] * 4 + ["Male"] * 6
sns.countplot(x=gender)
plt.show()

Using pandas with Seaborn

# Using DataFrames with countplot(): name the column and pass the frame
# through `data` instead of handing seaborn a raw list.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

df = pd.read_csv("masculinity.csv")
sns.countplot(data=df, x="how_masculine")
plt.show()

Adding a third variable with hue

# Tips dataset (loaded once and reused by every example below)
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

tips = sns.load_dataset("tips")
tips.head()

# A basic scatter plot
sns.scatterplot(data=tips, x="total_bill", y="tip")
plt.show()

# A scatter plot with hue: points are colored by the "smoker" column
sns.scatterplot(data=tips, x="total_bill", y="tip", hue="smoker")
plt.show()

# Setting hue order
sns.scatterplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="smoker",
    hue_order=["Yes", "No"],
)
plt.show()

# Specifying hue colors: map each hue value to a named color
hue_colors = {"Yes": "black", "No": "red"}
sns.scatterplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="smoker",
    palette=hue_colors,
)
plt.show()

# Using HTML hex color codes with hue
hue_colors = {"Yes": "#808080", "No": "#00FF00"}
sns.scatterplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="smoker",
    palette=hue_colors,
)
plt.show()

# Using hue with count plots
sns.countplot(data=tips, x="smoker", hue="sex")
plt.show()

Visualizing Two Quantitative Variables

Introduction to relational plots and subplots

# Using relplot(): figure-level counterpart of scatterplot()
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(data=tips, x="total_bill", y="tip", kind="scatter")
plt.show()

# Subplots in columns: one panel per value of "smoker", side by side
sns.relplot(data=tips, x="total_bill", y="tip", kind="scatter", col="smoker")
plt.show()

# Subplots in rows: one panel per value of "smoker", stacked vertically
sns.relplot(data=tips, x="total_bill", y="tip", kind="scatter", row="smoker")
plt.show()

# Subplots in rows and columns
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    kind="scatter",
    row="time",
    col="smoker",
)
plt.show()

# Wrapping columns: start a new row of panels after every 2 columns
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    kind="scatter",
    col="day",
    col_wrap=2,
)
plt.show()

# Ordering columns
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    kind="scatter",
    col="day",
    col_wrap=2,
    col_order=["Thur", "Fri", "Sat", "Sun"],
)
plt.show()

Customizing scatter plots

# Subgroups with point size: marker size follows the "size" column
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(data=tips, x="total_bill", y="tip", kind="scatter", size="size")
plt.show()

# Point size and hue: the same column drives both size and color
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    kind="scatter",
    size="size",
    hue="size",
)
plt.show()

# Subgroups with point style: the same column drives both color and marker
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    kind="scatter",
    hue="smoker",
    style="smoker",
)
plt.show()

# Changing point transparency
# Set alpha to be between 0 and 1
sns.relplot(data=tips, x="total_bill", y="tip", kind="scatter", alpha=0.4)
plt.show()

Introduction to line plots

# NOTE: `air_df_mean`, `air_df_loc_mean` and `air_df` are not created anywhere
# in these notes; they must be loaded before running the snippets below.
# The interval examples use `errorbar=`, which replaced the deprecated `ci=`
# parameter in seaborn 0.12.

# Line plot
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(data=air_df_mean, x="hour", y="NO_2_mean", kind="line")
plt.show()

# Subgroups by location: one line per location, varying in color and dashes
sns.relplot(
    data=air_df_loc_mean,
    x="hour",
    y="NO_2_mean",
    kind="line",
    style="location",
    hue="location",
)
plt.show()

# Adding markers
sns.relplot(
    data=air_df_loc_mean,
    x="hour",
    y="NO_2_mean",
    kind="line",
    style="location",
    hue="location",
    markers=True,
)
plt.show()

# Turning off line style
sns.relplot(
    data=air_df_loc_mean,
    x="hour",
    y="NO_2_mean",
    kind="line",
    style="location",
    hue="location",
    markers=True,
    dashes=False,
)
plt.show()

# Multiple observations per x-value
# Line plot
sns.relplot(data=air_df, x="hour", y="NO_2", kind="line")
plt.show()

# Replacing confidence interval with standard deviation
sns.relplot(data=air_df, x="hour", y="NO_2", kind="line", errorbar="sd")
plt.show()

# Turning off confidence interval
sns.relplot(data=air_df, x="hour", y="NO_2", kind="line", errorbar=None)
plt.show()

Visualizing a Categorical and a Quantitative Variable

Count plots and bar plots

# NOTE: `masculinity_data` is not created anywhere in these notes; presumably
# it is the DataFrame read from "masculinity.csv" -- confirm before running.

# countplot() vs. catplot()
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(data=masculinity_data, x="how_masculine", kind="count")
plt.show()

# Changing the order
category_order = ["No answer", "Not at all", "Not very", "Somewhat", "Very"]
sns.catplot(
    data=masculinity_data,
    x="how_masculine",
    kind="count",
    order=category_order,
)
plt.show()

# Bar plots
sns.catplot(data=tips, x="day", y="total_bill", kind="bar")
plt.show()

# Turning off confidence intervals
# `errorbar=None` replaces the `ci=None` spelling deprecated in seaborn 0.12.
sns.catplot(data=tips, x="day", y="total_bill", kind="bar", errorbar=None)
plt.show()

# Changing the orientation: swap x and y to get horizontal bars
sns.catplot(data=tips, x="total_bill", y="day", kind="bar")
plt.show()

Box plots

# How to create a box plot
import matplotlib.pyplot as plt
import seaborn as sns

g = sns.catplot(data=tips, x="time", y="total_bill", kind="box")
plt.show()

# Change the order of categories
g = sns.catplot(
    data=tips,
    x="time",
    y="total_bill",
    kind="box",
    order=["Dinner", "Lunch"],
)
plt.show()

# Omitting the outliers using `showfliers`
# (the older `sym=""` trick relied on matplotlib's Axes.boxplot keyword, which
# the box-plot backend used by seaborn 0.13+ no longer accepts)
g = sns.catplot(
    data=tips,
    x="time",
    y="total_bill",
    kind="box",
    showfliers=False,
)
plt.show()

# Changing the whiskers using `whis`
# A pair is read as (low, high) percentiles: [0, 100] spans min to max.
g = sns.catplot(
    data=tips,
    x="time",
    y="total_bill",
    kind="box",
    whis=[0, 100],
)
plt.show()

Point plots

# NOTE: `masculinity_data` is not created anywhere in these notes; load it
# before running the first two snippets.
# The snippets below target seaborn >= 0.13: `errorbar=` replaces `ci=`
# (deprecated in 0.12) and `linestyle="none"` replaces `join=False`
# (deprecated in 0.13).

# Creating a point plot
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(
    data=masculinity_data,
    x="age",
    y="masculinity_important",
    hue="feel_masculine",
    kind="point",
)
plt.show()

# Disconnecting the points
sns.catplot(
    data=masculinity_data,
    x="age",
    y="masculinity_important",
    hue="feel_masculine",
    kind="point",
    linestyle="none",
)
plt.show()

# Displaying the median
from numpy import median

sns.catplot(
    data=tips,
    x="smoker",
    y="total_bill",
    kind="point",
    estimator=median,
)
plt.show()

# Customizing the confidence intervals: add caps to the interval bars
sns.catplot(data=tips, x="smoker", y="total_bill", kind="point", capsize=0.2)
plt.show()

# Turning off confidence intervals
sns.catplot(
    data=tips,
    x="smoker",
    y="total_bill",
    kind="point",
    errorbar=None,
)
plt.show()

Customizing Seaborn Plots

Changing plot style and color

# Figure style: "whitegrid"
# set_style() changes the global style, so call it before drawing.
sns.set_style("whitegrid")
sns.catplot(
    data=masculinity_data,
    x="age",
    y="masculinity_important",
    hue="feel_masculine",
    kind="point",
)
plt.show()

# Other styles (each call replaces the previous one; the last one stays active):
sns.set_style("ticks")
sns.set_style("dark")
sns.set_style("darkgrid")

# Example (diverging palette)
sns.set_palette("RdBu")
category_order = ["No answer", "Not at all", "Not very", "Somewhat", "Very"]
sns.catplot(
    data=masculinity_data,
    x="how_masculine",
    kind="count",
    order=category_order,
)
plt.show()

# Custom palettes: a list of named colors
custom_palette = [
    "red",
    "green",
    "orange",
    "blue",
    "yellow",
    "purple",
]
sns.set_palette(custom_palette)

# Custom palettes: a list of hex color codes
custom_palette = [
    "#FBB4AE",
    "#B3CDE3",
    "#CCEBC5",
    "#DECBE4",
    "#FED9A6",
    "#FFFFCC",
    "#E5D8BD",
    "#FDDAEC",
    "#F2F2F2",
]
sns.set_palette(custom_palette)

# Larger context: "talk"
# Smallest to largest: "paper", "notebook", "talk", "poster"
sns.set_context("talk")

Adding titles and labels: Part 1

# Adding a title to FacetGrid
# catplot() returns a FacetGrid; the title is set on its underlying figure.
g = sns.catplot(data=gdp_data, x="Region", y="Birthrate", kind="box")
g.fig.suptitle("New Title")
plt.show()

# Adjusting height of title in FacetGrid
# y is passed to suptitle(); 1.03 is used here to lift the title.
g = sns.catplot(data=gdp_data, x="Region", y="Birthrate", kind="box")
g.fig.suptitle("New Title", y=1.03)
plt.show()

Adding titles and labels: Part 2

# Adding a title to AxesSubplot
# boxplot() returns a matplotlib Axes, so the title is set with set_title().
g = sns.boxplot(data=gdp_data, x="Region", y="Birthrate")
g.set_title("New Title", y=1.03)

# Titles for subplots
# suptitle() labels the whole figure; set_titles() labels each panel, with
# {col_name} filled in from the "Group" value of that panel.
g = sns.catplot(
    data=gdp_data,
    x="Region",
    y="Birthrate",
    kind="box",
    col="Group",
)
g.fig.suptitle("New Title", y=1.03)
g.set_titles("This is {col_name}")

# Adding axis labels
g = sns.catplot(data=gdp_data, x="Region", y="Birthrate", kind="box")
g.set(xlabel="New X Label", ylabel="New Y Label")
plt.show()

# Rotating x-axis tick labels
g = sns.catplot(data=gdp_data, x="Region", y="Birthrate", kind="box")
plt.xticks(rotation=90)
plt.show()

Intermediate Data Visualization with Seaborn

PDF lecture slides are available on GitHub.

Seaborn Introduction

Introduction to Seaborn

# Seaborn distribution plot
# distplot() is deprecated (seaborn 0.11+); histplot() with a KDE overlay on a
# density scale gives the closest equivalent of its default output.
import seaborn as sns

sns.histplot(df["alcohol"], kde=True, stat="density")

Using the distribution plot

# These snippets use the functions that replaced the deprecated distplot():
# histplot() for histograms, kdeplot() for density curves, rugplot() for rugs.

# Creating a histogram
sns.histplot(df["alcohol"], bins=10)

# Alternative data distributions: KDE curve plus a rug of the observations
sns.kdeplot(df["alcohol"])
sns.rugplot(df["alcohol"])

# Further Customizations: fill the area under the KDE curve
# (`fill=True` replaces the deprecated `shade=True`)
sns.kdeplot(df["alcohol"], fill=True)
sns.rugplot(df["alcohol"])

Regression Plots in Seaborn

# Introduction to regplot: scatter plot with a fitted regression line
sns.regplot(data=df, x="alcohol", y="pH")

# lmplot faceting
# hue= separates the "type" groups by color within one panel ...
sns.lmplot(data=df, x="quality", y="alcohol", hue="type")
# ... while col= gives every "type" group its own panel.
sns.lmplot(data=df, x="quality", y="alcohol", col="type")

Customizing Seaborn Plots

Using Seaborn Styles

# Setting Styles
# Seaborn has default configurations that can be applied with sns.set()
# These styles can override matplotlib and pandas plots as well
sns.set()

# Theme examples with sns.set_style()
# histplot(kde=True, stat="density") stands in for the deprecated distplot().
for style in ["white", "dark", "whitegrid", "darkgrid", "ticks"]:
    sns.set_style(style)
    sns.histplot(df["Tuition"], kde=True, stat="density")
    plt.show()

# Removing axes with despine()
sns.set_style("white")
sns.histplot(df["Tuition"], kde=True, stat="density")
sns.despine(left=True)

Colors in Seaborn

# histplot(kde=True, stat="density") stands in for the deprecated distplot()
# throughout this section.

# Defining a color for a plot
# color_codes=True remaps matplotlib's one-letter codes ("g", "b", ...) to the
# seaborn palette.
sns.set(color_codes=True)
sns.histplot(df["Tuition"], kde=True, stat="density", color="g")

# Palettes
for p in sns.palettes.SEABORN_PALETTES:
    sns.set_palette(p)
    sns.histplot(df["Tuition"], kde=True, stat="density")
    # Show each palette in its own figure; without this every histogram is
    # drawn on top of the previous one on the same axes.
    plt.show()

# Displaying Palettes
for p in sns.palettes.SEABORN_PALETTES:
    sns.set_palette(p)
    sns.palplot(sns.color_palette())
    plt.show()

# Defining Custom Palettes
# Circular colors = when the data is not ordered
sns.palplot(sns.color_palette("Paired", 12))

# Sequential colors = when the data has a consistent range from high to low
sns.palplot(sns.color_palette("Blues", 12))

# Diverging colors = when both the low and high values are interesting
sns.palplot(sns.color_palette("BrBG", 12))

Customizing with matplotlib

# histplot(kde=True, stat="density") stands in for the deprecated distplot()
# throughout this section; like distplot(), it accepts a target Axes via ax=.

# Matplotlib Axes
fig, ax = plt.subplots()
sns.histplot(df["Tuition"], kde=True, stat="density", ax=ax)
ax.set(xlabel="Tuition 2013-14")

# Further Customizations
fig, ax = plt.subplots()
sns.histplot(df["Tuition"], kde=True, stat="density", ax=ax)
ax.set(
    xlabel="Tuition 2013-14",
    ylabel="Distribution",
    xlim=(0, 50000),
    title="2013-14 Tuition and Fees Distribution",
)

# Combining Plots: all rows on the left, only State == "MN" on the right
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(7, 4))
sns.histplot(df["Tuition"], kde=True, stat="density", ax=ax0)
sns.histplot(
    df.query('State == "MN"')["Tuition"],
    kde=True,
    stat="density",
    ax=ax1,
)
ax1.set(xlabel="Tuition (MN)", xlim=(0, 70000))
# Labelled vertical reference line; legend() is what makes the label visible.
ax1.axvline(x=20000, label="My Budget", linestyle="--")
ax1.legend()

Additional Plot Types

Categorical Plot Types

# Plots of each observation - stripplot
sns.stripplot(
    data=df,
    y="DRG Definition",
    x="Average Covered Charges",
    jitter=True,
)

# Plots of each observation - swarmplot
sns.swarmplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Abstract representations - boxplot
sns.boxplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Abstract representation - violinplot
sns.violinplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Abstract representation - boxenplot (letter-value plot)
# lvplot() was renamed to boxenplot() and later removed from seaborn.
sns.boxenplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Statistical estimates - barplot
sns.barplot(
    data=df,
    y="DRG Definition",
    x="Average Covered Charges",
    hue="Region",
)

# Statistical estimates - pointplot
sns.pointplot(
    data=df,
    y="DRG Definition",
    x="Average Covered Charges",
    hue="Region",
)

# Statistical estimates - countplot
sns.countplot(data=df, y="DRG_Code", hue="Region")

Regression Plots

# numpy is needed for the `x_estimator=np.mean` example below; it was used
# without ever being imported in these notes.
import numpy as np

# Plotting with regplot()
sns.regplot(data=df, x="temp", y="total_rentals", marker="+")

# Evaluating regression with residplot()
sns.residplot(data=df, x="temp", y="total_rentals")

# Polynomial regression: order=2 fits a second-order polynomial
sns.regplot(data=df, x="temp", y="total_rentals", order=2)

# residplot with polynomial regression
sns.residplot(data=df, x="temp", y="total_rentals", order=2)

# Categorical values: x_jitter spreads points that share the same x value
sns.regplot(data=df, x="mnth", y="total_rentals", x_jitter=0.1, order=2)

# Estimators: plot one estimate (here the mean) per distinct x value
sns.regplot(
    data=df,
    x="mnth",
    y="total_rentals",
    x_estimator=np.mean,
    order=2,
)

# Binning the data: group x into 4 bins before plotting
sns.regplot(data=df, x="temp", y="total_rentals", x_bins=4)

Matrix plots

# Getting data in the right format
# The result is kept in `df_crosstab`, which the heatmap examples below use
# (the name was previously referenced without ever being assigned).
df_crosstab = pd.crosstab(
    df["mnth"],
    df["weekday"],
    values=df["total_rentals"],
    aggfunc="mean",
).round(0)
df_crosstab

# Build a heatmap
sns.heatmap(
    pd.crosstab(
        df["mnth"],
        df["weekday"],
        values=df["total_rentals"],
        aggfunc="mean",
    )
)

# Customize a heatmap
# fmt=".0f" prints whole numbers and works for the float means produced by
# the crosstab above; fmt="d" raises on float data.
sns.heatmap(
    df_crosstab,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    cbar=False,
    linewidths=0.5,
)

# Centering a heatmap
# The colormap is centered on the cell at row label 9, column label 6.
sns.heatmap(
    df_crosstab,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    cbar=True,
    center=df_crosstab.loc[9, 6],
)

# Plotting a correlation matrix
# numeric_only=True (pandas >= 1.5) skips non-numeric columns, which would
# otherwise make corr() raise on pandas >= 2.0.
sns.heatmap(df.corr(numeric_only=True))

Creating Plots on Data Aware Grids

Using FacetGrid, factorplot and lmplot

# FacetGrid Categorical Example
# Build the grid first (one column per "HIGHDEG" value), then map a plotting
# function onto every panel.
g = sns.FacetGrid(df, col="HIGHDEG")
g.map(sns.boxplot, "Tuition", order=["1", "2", "3", "4"])

# catplot()
# catplot() replaces factorplot(), which was renamed and later removed.
sns.catplot(data=df, x="Tuition", col="HIGHDEG", kind="box")

# FacetGrid for regression
# FacetGrid() can also be used for scatter or regression plots
g = sns.FacetGrid(df, col="HIGHDEG")
g.map(plt.scatter, "Tuition", "SAT_AVG_ALL")


# lmplot
# lmplot plots scatter and regression plots on a FacetGrid
sns.lmplot(
    data=df,
    x="Tuition",
    y="SAT_AVG_ALL",
    col="HIGHDEG",
    fit_reg=False,
)

# lmplot with regression
sns.lmplot(
    data=df,
    x="Tuition",
    y="SAT_AVG_ALL",
    col="HIGHDEG",
    row="REGION",
)

Using PairGrid and pairplot

# Creating a PairGrid: the same function is mapped onto every panel
g = sns.PairGrid(df, vars=["Fair_Mrkt_Rent", "Median_Income"])
g = g.map(plt.scatter)

# Customizing the PairGrid diagonals: histograms on the diagonal,
# scatter plots everywhere else
g = sns.PairGrid(df, vars=["Fair_Mrkt_Rent", "Median_Income"])
g = g.map_diag(plt.hist)
g = g.map_offdiag(plt.scatter)

# Pairplot: shortcut for the PairGrid setup above
sns.pairplot(
    df,
    vars=["Fair_Mrkt_Rent", "Median_Income"],
    kind="reg",
    diag_kind="hist",
)

# Customizing a pairplot
sns.pairplot(
    df.query("BEDRMS < 3"),
    vars=["Fair_Mrkt_Rent", "Median_Income", "UTILITY"],
    hue="BEDRMS",
    palette="husl",
    plot_kws={"alpha": 0.5},
)

Using JointGrid and jointplot

# Basic JointGrid
# plot(joint_func, marginal_func); histplot replaces the deprecated distplot.
g = sns.JointGrid(data=df, x="Tuition", y="ADM_RATE_ALL")
g.plot(sns.regplot, sns.histplot)

# Advanced JointGrid
from scipy import stats

g = sns.JointGrid(data=df, x="Tuition", y="ADM_RATE_ALL")
g = g.plot_joint(sns.kdeplot)
# `fill=True` replaces the deprecated `shade=True`.
g = g.plot_marginals(sns.kdeplot, fill=True)
# JointGrid.annotate() was removed from seaborn, so compute the Pearson
# correlation directly and write it onto the joint axes. Rows with missing
# values are dropped first because pearsonr() does not handle NaN.
pair = df[["Tuition", "ADM_RATE_ALL"]].dropna()
r_value, p_value = stats.pearsonr(pair["Tuition"], pair["ADM_RATE_ALL"])
g.ax_joint.annotate(
    f"pearsonr = {r_value:.2g}; p = {p_value:.2g}",
    xy=(0.05, 0.95),
    xycoords="axes fraction",
    ha="left",
    va="top",
)

# jointplot()
sns.jointplot(data=df, x="Tuition", y="ADM_RATE_ALL", kind="hex")

# Customizing a jointplot
# The marginal histograms no longer accept `rug=True`, so the rug is drawn as
# a separate layer with plot_marginals(sns.rugplot).
g = (
    sns.jointplot(
        data=df.query('UG < 2500 & Ownership == "Public"'),
        x="Tuition",
        y="ADM_RATE_ALL",
        kind="scatter",
        xlim=(0, 25000),
        marginal_kws=dict(bins=15),
    )
    .plot_joint(sns.kdeplot)
    .plot_marginals(sns.rugplot)
)