Introduction to Data Visualization with Matplotlib

matplotlib cheatsheet in pdf

pdf lecture in github

Introduction to Matplotlib

Introduction to data visualization with Matplotlib

# Introducing the pyplot interface
import matplotlib.pyplot as plt
# With no arguments, plt.subplots() creates one figure holding a single axes.
fig, ax = plt.subplots()
plt.show()

# Adding data to axes
# NOTE(review): seattle_weather is not defined anywhere in this file;
# presumably a pandas DataFrame loaded beforehand -- confirm before running.
ax.plot(seattle_weather["MONTH"], seattle_weather["MLY-TAVG-NORMAL"])
plt.show()

Customizing your plots

# Adding markers: "o" puts a circle on every data point
ax.plot(
    seattle_weather["MONTH"],
    seattle_weather["MLY-PRCP-NORMAL"],
    marker="o",
)
plt.show()

# Choosing markers: "v" is the downward-pointing triangle
ax.plot(
    seattle_weather["MONTH"],
    seattle_weather["MLY-PRCP-NORMAL"],
    marker="v",
)
plt.show()

markers

# Setting the linestyle: "--" selects a dashed line
fig, ax = plt.subplots()
ax.plot(
    seattle_weather["MONTH"],
    seattle_weather["MLY-TAVG-NORMAL"],
    marker="v",
    linestyle="--",
)
plt.show()

line style

# Eliminating lines with linestyle: the string "None" leaves only the markers
fig, ax = plt.subplots()
ax.plot(
    seattle_weather["MONTH"],
    seattle_weather["MLY-TAVG-NORMAL"],
    marker="v",
    linestyle="None",
)
plt.show()

# Choosing color: "r" is the one-letter shorthand for red
fig, ax = plt.subplots()
ax.plot(
    seattle_weather["MONTH"],
    seattle_weather["MLY-TAVG-NORMAL"],
    marker="v",
    linestyle="--",
    color="r",
)
plt.show()

# Customizing the axes labels (applied to the axes created just above)
ax.set_xlabel("Time (months)")
plt.show()

# Setting the y axis label
ax.set_xlabel("Time (months)")
ax.set_ylabel("Average temperature (Fahrenheit degrees)")
plt.show()

# Adding a title
ax.set_title("Weather in Seattle")
plt.show()

Small multiples

# Small multiples with plt.subplots
# Passing (rows, columns) makes `ax` an array of axes instead of a single one.
fig, ax = plt.subplots(3, 2)
plt.show()

# Adding data to subplots
# The array of axes has the same shape as the requested grid.
print(ax.shape)  # (3, 2)
ax[0, 0].plot(seattle_weather["MONTH"],
              seattle_weather["MLY-PRCP-NORMAL"],
              color='b')
plt.show()

# Subplots with data
# With a single column, `ax` is one-dimensional and takes a single index.
fig, ax = plt.subplots(2, 1)
# Top panel: Seattle, solid line for the normal and dashed lines for the
# 25th and 75th percentile columns.
ax[0].plot(seattle_weather["MONTH"], seattle_weather["MLY-PRCP-NORMAL"],
           color='b')
ax[0].plot(seattle_weather["MONTH"], seattle_weather["MLY-PRCP-25PCTL"],
           linestyle='--', color='b')
ax[0].plot(seattle_weather["MONTH"], seattle_weather["MLY-PRCP-75PCTL"],
           linestyle='--', color='b')
# Bottom panel: the same three columns for Austin, in red.
ax[1].plot(austin_weather["MONTH"], austin_weather["MLY-PRCP-NORMAL"],
           color='r')
ax[1].plot(austin_weather["MONTH"], austin_weather["MLY-PRCP-25PCTL"],
           linestyle='--', color='r')
ax[1].plot(austin_weather["MONTH"], austin_weather["MLY-PRCP-75PCTL"],
           linestyle='--', color='r')
ax[0].set_ylabel("Precipitation (inches)")
ax[1].set_ylabel("Precipitation (inches)")
# Only the bottom panel gets an x label.
ax[1].set_xlabel("Time (months)")
plt.show()

# Sharing the y-axis range
# sharey=True gives both panels the same y-axis limits so they can be compared.
fig, ax = plt.subplots(2, 1, sharey=True)

Plotting time-series

Plotting time-series data

# DatetimeIndex
# NOTE(review): climate_change is not defined in this file; the output below
# shows it is indexed by a DatetimeIndex named 'date'.
print(climate_change.index)
# Output as recorded in the notes (the list of dates is abridged):
# DatetimeIndex(['1958-03-06', '1958-04-06', '1958-05-06', '1958-06-06',
#                ...],
#               dtype='datetime64[ns]', name='date', length=706, freq=None)


# Plotting time-series data
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# The index supplies the x values and the 'co2' column the y values.
ax.plot(climate_change.index, climate_change['co2'])
ax.set_xlabel('Time')
ax.set_ylabel('CO2 (ppm)')
plt.show()

# Zooming in on a decade
# Slicing with date strings selects the rows between the two dates.
sixties = climate_change["1960-01-01":"1969-12-31"]
fig, ax = plt.subplots()
ax.plot(sixties.index, sixties['co2'])
ax.set_xlabel('Time')
ax.set_ylabel('CO2 (ppm)')
plt.show()

# Zooming in on one year
# Same slicing, narrowed to the twelve months of 1969.
sixty_nine = climate_change["1969-01-01":"1969-12-31"]
fig, ax = plt.subplots()
ax.plot(sixty_nine.index, sixty_nine['co2'])
ax.set_xlabel('Time')
ax.set_ylabel('CO2 (ppm)')
plt.show()

Plotting time-series with different variables

# Plotting two time-series together
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
# Both columns are drawn on the same axes, so they share one y scale.
ax.plot(climate_change.index, climate_change["co2"])
ax.plot(climate_change.index, climate_change["relative_temp"])
ax.set_xlabel('Time')
ax.set_ylabel('CO2 (ppm) / Relative temperature')
plt.show()

# Using twin axes
fig, ax = plt.subplots()
ax.plot(climate_change.index, climate_change["co2"])
ax.set_xlabel('Time')
ax.set_ylabel('CO2 (ppm)')
# twinx() returns a second axes that shares the x-axis but has its own
# y-axis, so each variable gets an independent y scale.
ax2 = ax.twinx()
ax2.plot(climate_change.index, climate_change["relative_temp"])
ax2.set_ylabel('Relative temperature (Celsius)')
plt.show()

# Separating variables by color: each line and its y label share a color
fig, ax = plt.subplots()
ax.plot(climate_change.index, climate_change["co2"], color='blue')
ax.set_xlabel('Time')
ax.set_ylabel('CO2 (ppm)', color='blue')
ax2 = ax.twinx()
ax2.plot(
    climate_change.index,
    climate_change["relative_temp"],
    color='red',
)
ax2.set_ylabel('Relative temperature (Celsius)', color='red')
plt.show()

# Coloring the ticks: the y ticks of each axes also take the line's color
fig, ax = plt.subplots()
ax.plot(
    climate_change.index,
    climate_change["co2"],
    color='blue',
)
ax.set_xlabel('Time')
ax.set_ylabel('CO2 (ppm)', color='blue')
ax.tick_params('y', colors='blue')
ax2 = ax.twinx()
ax2.plot(
    climate_change.index,
    climate_change["relative_temp"],
    color='red',
)
ax2.set_ylabel(
    'Relative temperature (Celsius)',
    color='red',
)
ax2.tick_params('y', colors='red')
plt.show()

# A function that plots time-series
def plot_timeseries(axes, x, y, color, xlabel, ylabel):
    """Draw one series on *axes* and color its y-axis decorations to match.

    Args:
        axes: Object to draw on; must provide ``plot``, ``set_xlabel``,
            ``set_ylabel`` and ``tick_params`` (e.g. a Matplotlib Axes).
        x: Values for the horizontal axis.
        y: Values for the vertical axis.
        color: Color used for the line, the y label and the y ticks.
        xlabel: Text for the x-axis label.
        ylabel: Text for the y-axis label.

    Returns:
        None. The function only draws on *axes*.
    """
    axes.plot(x, y, color=color)
    axes.set_xlabel(xlabel)
    # The y label and y ticks reuse the line color so that, on twin axes,
    # each y scale can be matched to its line.
    axes.set_ylabel(ylabel, color=color)
    axes.tick_params(axis='y', colors=color)
# Using our function
fig, ax = plt.subplots()
# CO2 goes on the primary axes, in blue.
plot_timeseries(ax, climate_change.index, climate_change['co2'],
                'blue', 'Time', 'CO2 (ppm)')
# The twin axes shares the x-axis with `ax` but has its own y-axis.
ax2 = ax.twinx()
# The temperature series must be drawn on ax2: drawing it on `ax` again would
# leave the twin axes empty and overwrite the CO2 y label and tick colors.
plot_timeseries(ax2, climate_change.index, climate_change['relative_temp'],
                'red', 'Time', 'Relative temperature (Celsius)')
plt.show()

Annotating time-series data

# Annotation
# pandas is needed for the Timestamp used as the annotation's x coordinate.
import pandas as pd

fig, ax = plt.subplots()
plot_timeseries(ax, climate_change.index, climate_change['co2'],
                'blue', 'Time', 'CO2 (ppm)')
ax2 = ax.twinx()
plot_timeseries(ax2, climate_change.index,
                climate_change['relative_temp'],
                'red', 'Time', 'Relative temperature (Celsius)')
# xy is the (x, y) data point being annotated. The class is spelled
# pd.Timestamp (lower-case "s"); pd.TimeStamp raises AttributeError.
ax2.annotate(">1 degree",
             xy=(pd.Timestamp("2015-10-06"), 1))
plt.show()

# Positioning the text: xytext places the label away from the point in xy
ax2.annotate(
    ">1 degree",
    xy=(pd.Timestamp('2015-10-06'), 1),
    xytext=(pd.Timestamp('2008-10-06'), -0.2),
)

# Adding arrows to annotation: an empty dict requests the default arrow
ax2.annotate(
    ">1 degree",
    xy=(pd.Timestamp('2015-10-06'), 1),
    xytext=(pd.Timestamp('2008-10-06'), -0.2),
    arrowprops={},
)

# Customizing arrow properties: entries in arrowprops style the arrow
ax2.annotate(
    ">1 degree",
    xy=(pd.Timestamp('2015-10-06'), 1),
    xytext=(pd.Timestamp('2008-10-06'), -0.2),
    arrowprops={"arrowstyle": "->", "color": "gray"},
)

Customizing annotations

Quantitative comparisons and statistical visualizations

Quantitative comparisons: bar-charts

# Olympic medals: visualizing the data
# pandas is required for read_csv; no earlier snippet in these notes imports it.
import pandas as pd

# index_col=0 makes the first CSV column the index, which is then used as
# the bar positions/labels below.
medals = pd.read_csv('medals_by_country_2016.csv', index_col=0)
fig, ax = plt.subplots()
ax.bar(medals.index, medals["Gold"])
plt.show()

# Interlude: rotate the tick labels
fig, ax = plt.subplots()
ax.bar(medals.index, medals["Gold"])
# rotation=90 draws the labels vertically.
ax.set_xticklabels(medals.index, rotation=90)
ax.set_ylabel("Number of medals")
plt.show()

# Olympic medals: visualizing the other medals : stacked bar chart
# plt.subplots must be *called*; unpacking the bare function object fails.
fig, ax = plt.subplots()
ax.bar(medals.index, medals["Gold"])
# bottom= starts each silver bar at the top of the matching gold bar.
ax.bar(medals.index, medals["Silver"], bottom=medals["Gold"])
ax.set_xticklabels(medals.index, rotation=90)
ax.set_ylabel("Number of medals")
plt.show()

# Olympic medals: visualizing all three
fig, ax = plt.subplots()
ax.bar(medals.index, medals["Gold"])
ax.bar(medals.index, medals["Silver"], bottom=medals["Gold"])
# Bronze sits on top of gold + silver.
ax.bar(medals.index, medals["Bronze"],
       bottom=medals["Gold"] + medals["Silver"])
ax.set_xticklabels(medals.index, rotation=90)
ax.set_ylabel("Number of medals")
plt.show()

# Adding a legend
fig, ax = plt.subplots()
# label= names each bar series; ax.legend() then builds the legend from them.
ax.bar(medals.index, medals["Gold"], label="Gold")
ax.bar(medals.index, medals["Silver"], bottom=medals["Gold"],
       label="Silver")
ax.bar(medals.index, medals["Bronze"],
       bottom=medals["Gold"] + medals["Silver"],
       label="Bronze")
ax.set_xticklabels(medals.index, rotation=90)
ax.set_ylabel("Number of medals")
ax.legend()
plt.show()

Quantitative comparisons: histograms

# Introducing histograms
# NOTE(review): the gymnastics frame is spelled mens_gymnastics here to match
# the statistical-plotting snippets later in these notes; neither frame is
# defined in this file -- confirm the name against the data-loading code.
fig, ax = plt.subplots()
ax.hist(mens_rowing["Height"])
ax.hist(mens_gymnastics["Height"])
ax.set_xlabel("Height (cm)")
ax.set_ylabel("# of observations")
plt.show()

# Labels are needed
# Each variation below starts from a fresh figure so the new histograms are
# not drawn on top of the ones from the previous snippet.
fig, ax = plt.subplots()
ax.hist(mens_rowing["Height"], label="Rowing")
ax.hist(mens_gymnastics["Height"], label="Gymnastics")
ax.set_xlabel("Height (cm)")
ax.set_ylabel("# of observations")
ax.legend()
plt.show()

# Customizing histograms: setting the number of bins
fig, ax = plt.subplots()
ax.hist(mens_rowing["Height"], label="Rowing", bins=5)
ax.hist(mens_gymnastics["Height"], label="Gymnastics", bins=5)
ax.set_xlabel("Height (cm)")
ax.set_ylabel("# of observations")
ax.legend()
plt.show()

# Customizing histograms: setting bin boundaries
# A list passed as bins= gives the bin edges; both histograms use the same
# edges so their bars line up.
height_bins = [150, 160, 170, 180, 190, 200, 210]
fig, ax = plt.subplots()
ax.hist(mens_rowing["Height"], label="Rowing", bins=height_bins)
ax.hist(mens_gymnastics["Height"], label="Gymnastics", bins=height_bins)
ax.set_xlabel("Height (cm)")
ax.set_ylabel("# of observations")
ax.legend()
plt.show()

# Customizing histograms: transparency
# histtype="step" draws only the outline, so one histogram no longer hides
# the other.
height_bins = [150, 160, 170, 180, 190, 200, 210]
fig, ax = plt.subplots()
ax.hist(mens_rowing["Height"], label="Rowing",
        bins=height_bins, histtype="step")
ax.hist(mens_gymnastics["Height"], label="Gymnastics",
        bins=height_bins, histtype="step")
ax.set_xlabel("Height (cm)")
ax.set_ylabel("# of observations")
ax.legend()
plt.show()

Statistical plotting

# Adding error bars to bar charts: bar height is the mean, yerr the std
fig, ax = plt.subplots()
ax.bar(
    "Rowing",
    mens_rowing["Height"].mean(),
    yerr=mens_rowing["Height"].std(),
)
ax.bar(
    "Gymnastics",
    mens_gymnastics["Height"].mean(),
    yerr=mens_gymnastics["Height"].std(),
)
ax.set_ylabel("Height (cm)")
plt.show()

# Adding error bars to plots: one errorbar line per city
fig, ax = plt.subplots()
ax.errorbar(
    seattle_weather["MONTH"],
    seattle_weather["MLY-TAVG-NORMAL"],
    yerr=seattle_weather["MLY-TAVG-STDDEV"],
)

ax.errorbar(
    austin_weather["MONTH"],
    austin_weather["MLY-TAVG-NORMAL"],
    yerr=austin_weather["MLY-TAVG-STDDEV"],
)

ax.set_ylabel("Temperature (Fahrenheit)")
plt.show()

# Adding boxplots: one box per entry of the list, labelled in the same order
fig, ax = plt.subplots()
ax.boxplot(
    [
        mens_rowing["Height"],
        mens_gymnastics["Height"],
    ]
)
ax.set_xticklabels(["Rowing", "Gymnastics"])
ax.set_ylabel("Height (cm)")
plt.show()

Quantitative comparisons: scatter plots

# Introducing scatter plots: one variable per axis, one point per row
fig, ax = plt.subplots()
ax.scatter(climate_change["co2"], climate_change["relative_temp"])
ax.set_xlabel("CO2 (ppm)")
ax.set_ylabel("Relative temperature (Celsius)")
plt.show()

# Customizing scatter plots: two decades, each with its own color and label
eighties = climate_change["1980-01-01":"1989-12-31"]
nineties = climate_change["1990-01-01":"1999-12-31"]
fig, ax = plt.subplots()
ax.scatter(
    eighties["co2"],
    eighties["relative_temp"],
    color="red",
    label="eighties",
)
ax.scatter(
    nineties["co2"],
    nineties["relative_temp"],
    color="blue",
    label="nineties",
)
ax.legend()
ax.set_xlabel("CO2 (ppm)")
ax.set_ylabel("Relative temperature (Celsius)")
plt.show()

# Encoding a third variable by color: the index is passed as the c= values
fig, ax = plt.subplots()
ax.scatter(
    climate_change["co2"],
    climate_change["relative_temp"],
    c=climate_change.index,
)
ax.set_xlabel("CO2 (ppm)")
ax.set_ylabel("Relative temperature (Celsius)")
plt.show()

Sharing visualizations with others

Preparing your figures to share with others

# Choosing a style
# plt.style.use changes the look of every figure created afterwards.
plt.style.use("ggplot")
fig, ax = plt.subplots()
# The closing "])" of this call was missing in the notes, which made the
# snippet a syntax error.
ax.plot(seattle_weather["MONTH"], seattle_weather["MLY-TAVG-NORMAL"])
ax.plot(austin_weather["MONTH"], austin_weather["MLY-TAVG-NORMAL"])
ax.set_xlabel("Time (months)")
ax.set_ylabel("Average temperature (Fahrenheit degrees)")
plt.show()

# Back to the default
plt.style.use("default")

available styles

# The "bmh" style
plt.style.use("bmh")
fig, ax = plt.subplots()
# The closing "])" of this call was missing in the notes, which made the
# snippet a syntax error.
ax.plot(seattle_weather["MONTH"], seattle_weather["MLY-TAVG-NORMAL"])
ax.plot(austin_weather["MONTH"], austin_weather["MLY-TAVG-NORMAL"])
ax.set_xlabel("Time (months)")
ax.set_ylabel("Average temperature (Fahrenheit degrees)")
plt.show()

# Seaborn styles
# Newer Matplotlib releases renamed the bundled seaborn styles to
# "seaborn-v0_8-<name>" and dropped the old names; use whichever spelling the
# installed version provides.
seaborn_style = "seaborn-v0_8-colorblind"
if seaborn_style not in plt.style.available:
    seaborn_style = "seaborn-colorblind"
plt.style.use(seaborn_style)
fig, ax = plt.subplots()
# The closing "])" of this call was missing in the notes, which made the
# snippet a syntax error.
ax.plot(seattle_weather["MONTH"], seattle_weather["MLY-TAVG-NORMAL"])
ax.plot(austin_weather["MONTH"], austin_weather["MLY-TAVG-NORMAL"])
ax.set_xlabel("Time (months)")
ax.set_ylabel("Average temperature (Fahrenheit degrees)")
plt.show()

Saving your visualizations

# Saving the figure to file
fig, ax = plt.subplots()
ax.bar(medals.index, medals["Gold"])
ax.set_xticklabels(medals.index, rotation=90)
ax.set_ylabel("Number of medals")
# savefig writes the figure to disk; the format follows the file extension.
fig.savefig("gold_medals.png")

# Different file formats
fig.savefig("gold_medals.jpg")
# JPEG quality is passed through pil_kwargs; the former quality= keyword of
# savefig was deprecated and later removed from Matplotlib.
fig.savefig("gold_medals.jpg", pil_kwargs={"quality": 50})
fig.savefig("gold_medals.svg")

# Resolution
# dpi = dots per inch; a higher value gives a denser (and larger) image.
fig.savefig("gold_medals.png", dpi=300)

# Size
# set_size_inches takes [width, height] in inches.
fig.set_size_inches([5, 3])

# Another aspect ratio
fig.set_size_inches([3, 5])

Automating figures from data

# Getting unique values of a column
# NOTE(review): summer_2016_medals is not defined in this file; presumably a
# DataFrame with at least "Sport" and "Height" columns -- confirm.
sports = summer_2016_medals["Sport"].unique()

# Bar-chart of heights for all sports
fig, ax = plt.subplots()
for sport in sports:
    # Rows for this sport only. The notes had this line cut off at "== spor"
    # and the loop body unindented, so the snippet could not run.
    sport_df = summer_2016_medals[summer_2016_medals["Sport"] == sport]
    # One bar per sport: mean height, with the standard deviation as error bar.
    ax.bar(sport, sport_df["Height"].mean(),
           yerr=sport_df["Height"].std())
ax.set_ylabel("Height (cm)")
ax.set_xticklabels(sports, rotation=90)
plt.show()