Introduction to Data Visualization with Seaborn
Introduction to Seaborn
Introduction to Seaborn
# Getting started
import seaborn as sns
import matplotlib.pyplot as plt

# Example 1: Scatter plot
import seaborn as sns
import matplotlib.pyplot as plt

# Two parallel lists: element i of each list describes the same observation.
height = [62, 64, 69, 75, 66, 68, 65, 71, 76, 73]
weight = [120, 136, 148, 175, 137, 165, 154, 172, 200, 187]
sns.scatterplot(x=height, y=weight)
plt.show()

# Example 2: Create a count plot
import seaborn as sns
import matplotlib.pyplot as plt

gender = [
    "Female", "Female", "Female", "Female",
    "Male", "Male", "Male", "Male", "Male", "Male",
]
# One bar per distinct value in the list.
sns.countplot(x=gender)
plt.show()
Using pandas with Seaborn
# Using DataFrames with countplot()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The CSV path is relative to the current working directory.
df = pd.read_csv("masculinity.csv")
# With data=df, x names a column of the DataFrame instead of passing a list.
sns.countplot(x="how_masculine", data=df)
plt.show()
Adding a third variable with hue
# Tips dataset
import pandas as pd
import seaborn as sns

tips = sns.load_dataset("tips")
tips.head()

# A basic scatter plot
import matplotlib.pyplot as plt
import seaborn as sns

sns.scatterplot(x="total_bill", y="tip", data=tips)
plt.show()

# A scatter plot with hue
import matplotlib.pyplot as plt
import seaborn as sns

sns.scatterplot(x="total_bill", y="tip", data=tips, hue="smoker")
plt.show()

# Setting hue order
import matplotlib.pyplot as plt
import seaborn as sns

sns.scatterplot(
    x="total_bill",
    y="tip",
    data=tips,
    hue="smoker",
    hue_order=["Yes", "No"],
)
plt.show()

# Specifying hue colors
import matplotlib.pyplot as plt
import seaborn as sns

# Maps each value of the hue column to a named color.
hue_colors = {"Yes": "black", "No": "red"}
sns.scatterplot(
    x="total_bill",
    y="tip",
    data=tips,
    hue="smoker",
    palette=hue_colors,
)
plt.show()

# Using HTML hex color codes with hue
import matplotlib.pyplot as plt
import seaborn as sns

hue_colors = {"Yes": "#808080", "No": "#00FF00"}
sns.scatterplot(
    x="total_bill",
    y="tip",
    data=tips,
    hue="smoker",
    palette=hue_colors,
)
plt.show()

# Using hue with count plots
import matplotlib.pyplot as plt
import seaborn as sns

sns.countplot(x="smoker", data=tips, hue="sex")
plt.show()
Visualizing Two Quantitative Variables
Introduction to relational plots and subplots
# Using relplot()
# NOTE(review): `tips` is not defined in this section — presumably the
# DataFrame loaded with sns.load_dataset("tips") earlier; confirm.
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(x="total_bill", y="tip", data=tips, kind="scatter")
plt.show()

# Subplots in columns
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(x="total_bill", y="tip", data=tips, kind="scatter", col="smoker")
plt.show()

# Subplots in rows
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(x="total_bill", y="tip", data=tips, kind="scatter", row="smoker")
plt.show()

# Subplots in rows and columns
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(
    x="total_bill",
    y="tip",
    data=tips,
    kind="scatter",
    col="smoker",
    row="time",
)
plt.show()

# Wrapping columns
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(
    x="total_bill",
    y="tip",
    data=tips,
    kind="scatter",
    col="day",
    col_wrap=2,
)
plt.show()

# Ordering columns
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(
    x="total_bill",
    y="tip",
    data=tips,
    kind="scatter",
    col="day",
    col_wrap=2,
    col_order=["Thur", "Fri", "Sat", "Sun"],
)
plt.show()
Customizing scatter plots
# Subgroups with point size
# NOTE(review): `tips` is not defined in this section — presumably the
# DataFrame loaded with sns.load_dataset("tips") earlier; confirm.
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(x="total_bill", y="tip", data=tips, kind="scatter", size="size")
plt.show()

# Point size and hue
import seaborn as sns
import matplotlib.pyplot as plt

# The same column drives both marker size and marker color.
sns.relplot(
    x="total_bill",
    y="tip",
    data=tips,
    kind="scatter",
    size="size",
    hue="size",
)
plt.show()

# Subgroups with point style
import seaborn as sns
import matplotlib.pyplot as plt

sns.relplot(
    x="total_bill",
    y="tip",
    data=tips,
    kind="scatter",
    hue="smoker",
    style="smoker",
)
plt.show()

# Changing point transparency
import seaborn as sns
import matplotlib.pyplot as plt

# Set alpha to be between 0 and 1
sns.relplot(x="total_bill", y="tip", data=tips, kind="scatter", alpha=0.4)
plt.show()
Introduction to line plots
# Line plot
# NOTE(review): air_df_mean, air_df_loc_mean and air_df are not defined in
# this file — presumably DataFrames prepared elsewhere; confirm.
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(x="hour", y="NO_2_mean", data=air_df_mean, kind="line")
plt.show()

# Subgroups by location
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(
    x="hour",
    y="NO_2_mean",
    data=air_df_loc_mean,
    kind="line",
    style="location",
    hue="location",
)
plt.show()

# Adding markers
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(
    x="hour",
    y="NO_2_mean",
    data=air_df_loc_mean,
    kind="line",
    style="location",
    hue="location",
    markers=True,
)
plt.show()

# Turning off line style
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(
    x="hour",
    y="NO_2_mean",
    data=air_df_loc_mean,
    kind="line",
    style="location",
    hue="location",
    markers=True,
    dashes=False,
)
plt.show()

# Multiple observations per x-value
# Line plot
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(x="hour", y="NO_2", data=air_df, kind="line")
plt.show()

# Replacing confidence interval with standard deviation
# NOTE(review): newer seaborn releases replace `ci` with `errorbar` —
# confirm against the installed version.
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(x="hour", y="NO_2", data=air_df, kind="line", ci="sd")
plt.show()

# Turning off confidence interval
import matplotlib.pyplot as plt
import seaborn as sns

sns.relplot(x="hour", y="NO_2", data=air_df, kind="line", ci=None)
plt.show()
Visualizing a Categorical and a Quantitative Variable
Count plots and bar plots
# countplot() vs. catplot()
# NOTE(review): masculinity_data and tips are not defined in this section —
# presumably DataFrames loaded elsewhere; confirm.
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(x="how_masculine", data=masculinity_data, kind="count")
plt.show()

# Changing the order
import matplotlib.pyplot as plt
import seaborn as sns

category_order = ["No answer", "Not at all", "Not very", "Somewhat", "Very"]
sns.catplot(
    x="how_masculine",
    data=masculinity_data,
    kind="count",
    order=category_order,
)
plt.show()

# Bar plots
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(x="day", y="total_bill", data=tips, kind="bar")
plt.show()

# Turning off confidence intervals
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(x="day", y="total_bill", data=tips, kind="bar", ci=None)
plt.show()

# Changing the orientation
import matplotlib.pyplot as plt
import seaborn as sns

# Same columns as the bar plot above with x and y swapped.
sns.catplot(x="total_bill", y="day", data=tips, kind="bar")
plt.show()
Box plots
# How to create a box plot
# NOTE(review): `tips` is not defined in this section — presumably the
# DataFrame loaded with sns.load_dataset("tips") earlier; confirm.
import matplotlib.pyplot as plt
import seaborn as sns

g = sns.catplot(x="time", y="total_bill", data=tips, kind="box")
plt.show()

# Change the order of categories
import matplotlib.pyplot as plt
import seaborn as sns

g = sns.catplot(
    x="time",
    y="total_bill",
    data=tips,
    kind="box",
    order=["Dinner", "Lunch"],
)
plt.show()

# Omitting the outliers using `sym`
import matplotlib.pyplot as plt
import seaborn as sns

g = sns.catplot(x="time", y="total_bill", data=tips, kind="box", sym="")
plt.show()

# Changing the whiskers using `whis`
import matplotlib.pyplot as plt
import seaborn as sns

g = sns.catplot(x="time", y="total_bill", data=tips, kind="box", whis=[0, 100])
plt.show()
Point plots
# Creating a point plot
# NOTE(review): masculinity_data and tips are not defined in this section —
# presumably DataFrames loaded elsewhere; confirm.
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(
    x="age",
    y="masculinity_important",
    data=masculinity_data,
    hue="feel_masculine",
    kind="point",
)
plt.show()

# Disconnecting the points
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(
    x="age",
    y="masculinity_important",
    data=masculinity_data,
    hue="feel_masculine",
    kind="point",
    join=False,
)
plt.show()

# Displaying the median
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import median

sns.catplot(
    x="smoker",
    y="total_bill",
    data=tips,
    kind="point",
    estimator=median,
)
plt.show()

# Customizing the confidence intervals
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(x="smoker", y="total_bill", data=tips, kind="point", capsize=0.2)
plt.show()

# Turning off confidence intervals
import matplotlib.pyplot as plt
import seaborn as sns

sns.catplot(x="smoker", y="total_bill", data=tips, kind="point", ci=None)
plt.show()
Customizing Seaborn Plots
Changing plot style and color
# Figure style: "whitegrid"
# NOTE(review): sns, plt and masculinity_data are not defined in this
# section — presumably imported/loaded earlier; confirm.
sns.set_style("whitegrid")
sns.catplot(
    x="age",
    y="masculinity_important",
    data=masculinity_data,
    hue="feel_masculine",
    kind="point",
)
plt.show()

# Other styles:
sns.set_style("ticks")
sns.set_style("dark")
sns.set_style("darkgrid")

# Example (diverging palette)
sns.set_palette("RdBu")
category_order = ["No answer", "Not at all", "Not very", "Somewhat", "Very"]
sns.catplot(
    x="how_masculine",
    data=masculinity_data,
    kind="count",
    order=category_order,
)
plt.show()

# Custom palettes
custom_palette = ["red", "green", "orange", "blue", "yellow", "purple"]
sns.set_palette(custom_palette)

# Custom palettes
custom_palette = [
    '#FBB4AE', '#B3CDE3', '#CCEBC5',
    '#DECBE4', '#FED9A6', '#FFFFCC',
    '#E5D8BD', '#FDDAEC', '#F2F2F2',
]
sns.set_palette(custom_palette)

# Larger context: "talk"
# Smallest to largest: "paper", "notebook", "talk", "poster"
sns.set_context("talk")
Adding titles and labels: Part 1
# Adding a title to FacetGrid
# NOTE(review): gdp_data is not defined in this file — presumably a
# DataFrame loaded elsewhere; confirm.
g = sns.catplot(x="Region", y="Birthrate", data=gdp_data, kind="box")
# The title is set on the figure that backs the object catplot() returned.
g.fig.suptitle("New Title")
plt.show()

# Adjusting height of title in FacetGrid
g = sns.catplot(x="Region", y="Birthrate", data=gdp_data, kind="box")
g.fig.suptitle("New Title", y=1.03)
plt.show()
Adding titles and labels: Part 2
# Adding a title to AxesSubplot
# NOTE(review): gdp_data is not defined in this file — presumably a
# DataFrame loaded elsewhere; confirm.
g = sns.boxplot(x="Region", y="Birthrate", data=gdp_data)
g.set_title("New Title", y=1.03)

# Titles for subplots
g = sns.catplot(
    x="Region",
    y="Birthrate",
    data=gdp_data,
    kind="box",
    col="Group",
)
g.fig.suptitle("New Title", y=1.03)
# "{col_name}" is a template placeholder passed through verbatim to set_titles().
g.set_titles("This is {col_name}")

# Adding axis labels
g = sns.catplot(x="Region", y="Birthrate", data=gdp_data, kind="box")
g.set(xlabel="New X Label", ylabel="New Y Label")
plt.show()

# Rotating x-axis tick labels
g = sns.catplot(x="Region", y="Birthrate", data=gdp_data, kind="box")
plt.xticks(rotation=90)
plt.show()
Intermediate Data Visualization with Seaborn
Seaborn Introduction
Introduction to Seaborn
# Seaborn distplot
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# with an 'alcohol' column loaded elsewhere; confirm. Newer seaborn releases
# deprecate distplot() — check the installed version.
import seaborn as sns

sns.distplot(df['alcohol'])
Using the distribution plot
# Creating a histogram
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# with an 'alcohol' column loaded elsewhere; confirm.
sns.distplot(df['alcohol'], kde=False, bins=10)

# Alternative data distributions
sns.distplot(df['alcohol'], hist=False, rug=True)

# Further Customizations
# kde_kws is a dict of extra keyword arguments for the KDE part of the plot.
sns.distplot(df['alcohol'], hist=False, rug=True, kde_kws={'shade': True})
Regression Plots in Seaborn
# Introduction to regplot
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# loaded elsewhere; confirm.
sns.regplot(x="alcohol", y="pH", data=df)

# lmplot faceting
# Same x/y twice: first split by color (hue), then by subplot column (col).
sns.lmplot(x="quality", y="alcohol", data=df, hue="type")
sns.lmplot(x="quality", y="alcohol", data=df, col="type")
Customizing Seaborn Plots
Using Seaborn Styles
# Setting Styles
# Seaborn has default configurations that can be applied with sns.set()
# These styles can override matplotlib and pandas plots as well
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# with a 'Tuition' column loaded elsewhere; confirm.
sns.set()

# Theme examples with sns.set_style()
# Draws and shows the same plot once per style name.
for style in ['white', 'dark', 'whitegrid', 'darkgrid', 'ticks']:
    sns.set_style(style)
    sns.distplot(df['Tuition'])
    plt.show()

# Removing axes with despine()
sns.set_style('white')
sns.distplot(df['Tuition'])
sns.despine(left=True)
Colors in Seaborn
# Defining a color for a plot
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# with a 'Tuition' column loaded elsewhere; confirm.
sns.set(color_codes=True)
sns.distplot(df['Tuition'], color='g')

# Palettes
# Re-draws the same plot under every palette in SEABORN_PALETTES.
for p in sns.palettes.SEABORN_PALETTES:
    sns.set_palette(p)
    sns.distplot(df['Tuition'])

# Displaying Palettes
for p in sns.palettes.SEABORN_PALETTES:
    sns.set_palette(p)
    sns.palplot(sns.color_palette())
    plt.show()

# Defining Custom Palettes
# Circular colors = when the data is not ordered
sns.palplot(sns.color_palette("Paired", 12))

# Sequential colors = when the data has a consistent range from high to low
sns.palplot(sns.color_palette("Blues", 12))

# Diverging colors = when both the low and high values are interesting
sns.palplot(sns.color_palette("BrBG", 12))
Customizing with matplotlib
# Matplotlib Axes
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# with 'Tuition' and 'State' columns loaded elsewhere; confirm.
fig, ax = plt.subplots()
# Passing ax= draws onto the Axes created above.
sns.distplot(df['Tuition'], ax=ax)
ax.set(xlabel="Tuition 2013-14")

# Further Customizations
fig, ax = plt.subplots()
sns.distplot(df['Tuition'], ax=ax)
ax.set(
    xlabel="Tuition 2013-14",
    ylabel="Distribution",
    xlim=(0, 50000),
    title="2013-14 Tuition and Fees Distribution",
)

# Combining Plots
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(7, 4))
sns.distplot(df['Tuition'], ax=ax0)
# Second panel: only the rows where State == "MN".
sns.distplot(df.query('State == "MN"')['Tuition'], ax=ax1)
ax1.set(xlabel="Tuition (MN)", xlim=(0, 70000))
ax1.axvline(x=20000, label='My Budget', linestyle='--')
ax1.legend()
Additional Plot Types
Categorical Plot Types
# Plots of each observation - stripplot
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# loaded elsewhere; confirm.
sns.stripplot(
    data=df,
    y="DRG Definition",
    x="Average Covered Charges",
    jitter=True,
)

# Plots of each observation - swarmplot
sns.swarmplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Abstract representations - boxplot
sns.boxplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Abstract representation - violinplot
sns.violinplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Abstract representation - lvplot
# NOTE(review): newer seaborn releases expose this plot as boxenplot() —
# confirm against the installed version.
sns.lvplot(data=df, y="DRG Definition", x="Average Covered Charges")

# Statistical estimates - barplot
sns.barplot(
    data=df,
    y="DRG Definition",
    x="Average Covered Charges",
    hue="Region",
)

# Statistical estimates - pointplot
sns.pointplot(
    data=df,
    y="DRG Definition",
    x="Average Covered Charges",
    hue="Region",
)

# Statistical estimates - countplot
sns.countplot(data=df, y="DRG_Code", hue="Region")
Regression Plots
# Plotting with regplot()
# NOTE(review): `df` and `np` are not defined in this section — presumably a
# DataFrame loaded elsewhere and `import numpy as np`; confirm.
sns.regplot(data=df, x='temp', y='total_rentals', marker='+')

# Evaluating regression with residplot()
sns.residplot(data=df, x='temp', y='total_rentals')

# Polynomial regression
sns.regplot(data=df, x='temp', y='total_rentals', order=2)

# residplot with polynomial regression
sns.residplot(data=df, x='temp', y='total_rentals', order=2)

# Categorical values
sns.regplot(data=df, x='mnth', y='total_rentals', x_jitter=.1, order=2)

# Estimators
sns.regplot(data=df, x='mnth', y='total_rentals', x_estimator=np.mean, order=2)

# Binning the data
sns.regplot(data=df, x='temp', y='total_rentals', x_bins=4)
Matrix plots
# Getting data in the right format
# NOTE(review): `df` and `df_crosstab` are not defined in this section —
# df_crosstab is presumably the crosstab result below; confirm.
pd.crosstab(
    df["mnth"],
    df["weekday"],
    values=df["total_rentals"],
    aggfunc='mean',
).round(0)

# Build a heatmap
sns.heatmap(
    pd.crosstab(
        df["mnth"],
        df["weekday"],
        values=df["total_rentals"],
        aggfunc='mean',
    )
)

# Customize a heatmap
sns.heatmap(
    df_crosstab,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    cbar=False,
    linewidths=.5,
)

# Centering a heatmap
# The colormap center is the value of the cell at row label 9, column label 6.
sns.heatmap(
    df_crosstab,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    cbar=True,
    center=df_crosstab.loc[9, 6],
)

# Plotting a correlation matrix
sns.heatmap(df.corr())
Creating Plots on Data Aware Grids
Using FacetGrid, factorplot and lmplot
# FacetGrid Categorical Example
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# loaded elsewhere; confirm.
g = sns.FacetGrid(df, col="HIGHDEG")
g.map(sns.boxplot, 'Tuition', order=['1', '2', '3', '4'])

# factorplot()
# NOTE(review): newer seaborn releases expose this function as catplot() —
# confirm against the installed version.
sns.factorplot(x="Tuition", data=df, col="HIGHDEG", kind='box')

# FacetGrid for regression
# FacetGrid() can also be used for scatter or regression plots
g = sns.FacetGrid(df, col="HIGHDEG")
g.map(plt.scatter, 'Tuition', 'SAT_AVG_ALL')

# lmplot
# lmplot plots scatter and regression plots on a FacetGrid
sns.lmplot(
    data=df,
    x="Tuition",
    y="SAT_AVG_ALL",
    col="HIGHDEG",
    fit_reg=False,
)

# lmplot with regression
sns.lmplot(
    data=df,
    x="Tuition",
    y="SAT_AVG_ALL",
    col="HIGHDEG",
    row='REGION',
)
Using PairGrid and pairplot
# Creating a PairGrid
# NOTE(review): `df` is not defined in this section — presumably a DataFrame
# loaded elsewhere; confirm.
g = sns.PairGrid(df, vars=["Fair_Mrkt_Rent", "Median_Income"])
g = g.map(plt.scatter)

# Customizing the PairGrid diagonals
g = sns.PairGrid(df, vars=["Fair_Mrkt_Rent", "Median_Income"])
g = g.map_diag(plt.hist)
g = g.map_offdiag(plt.scatter)

# Pairplot
sns.pairplot(
    df,
    vars=["Fair_Mrkt_Rent", "Median_Income"],
    kind='reg',
    diag_kind='hist',
)

# Customizing a pairplot
# Only rows with BEDRMS < 3 are plotted; plot_kws is a dict of extra keyword
# arguments for the plots.
sns.pairplot(
    df.query('BEDRMS < 3'),
    vars=["Fair_Mrkt_Rent", "Median_Income", "UTILITY"],
    hue='BEDRMS',
    palette='husl',
    plot_kws={'alpha': 0.5},
)
Using JointGrid and jointplot
# Basic JointGrid
# NOTE(review): `df` and `stats` are not defined in this section — presumably
# a DataFrame loaded elsewhere and `from scipy import stats`; confirm.
g = sns.JointGrid(data=df, x="Tuition", y="ADM_RATE_ALL")
g.plot(sns.regplot, sns.distplot)

# Advanced JointGrid
g = sns.JointGrid(data=df, x="Tuition", y="ADM_RATE_ALL")
g = g.plot_joint(sns.kdeplot)
g = g.plot_marginals(sns.kdeplot, shade=True)
# NOTE(review): JointGrid.annotate() may be unavailable in newer seaborn
# releases — confirm against the installed version.
g = g.annotate(stats.pearsonr)

# jointplot()
sns.jointplot(data=df, x="Tuition", y="ADM_RATE_ALL", kind='hex')

# Customizing a jointplot
# jointplot() draws the scatter, then plot_joint() is called on its result
# with sns.kdeplot.
g = (
    sns.jointplot(
        x="Tuition",
        y="ADM_RATE_ALL",
        kind='scatter',
        xlim=(0, 25000),
        marginal_kws=dict(bins=15, rug=True),
        data=df.query('UG < 2500 & Ownership == "Public"'),
    )
    .plot_joint(sns.kdeplot)
)