#!/usr/bin/env python
# coding: utf-8

# # USA Births Data
#
# *By Jake VanderPlas. See http://github.com/jakevdp/data-CDCbirths/*
#
# This dataset records birth rates in the USA by year. It was compiled from
# data on the [CDC website](http://www.cdc.gov/nchs/data_access/Vitalstatsonline.htm)
# and is aggregated so as to comply with their terms of use.
#
# *(This was inspired by Andrew Gelman's recent
# [NYC R Meetup](https://www.youtube.com/watch?v=ObS1hkOxyPA&feature=youtu.be)
# talk. Some of his writeup on the topic can be seen
# [on his blog](http://andrewgelman.com/2012/06/12/simple-graph-win-the-example-of-birthday-frequencies/))*

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

from datetime import datetime

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

seaborn.set()

# In[2]:

births = pd.read_csv('births.csv')
births.head()

# In[3]:

# Some data is mis-reported; e.g. June 31st, etc.
# Remove these outliers via robust sigma-clipping: estimate the center with
# the median and the spread from the interquartile range.
q25, q50, q75 = np.percentile(births['births'], [25, 50, 75])
mu = q50
# 0.7413 ~= 1 / 1.349 rescales the IQR to a Gaussian standard deviation.
sig = 0.7413 * (q75 - q25)

# Take an explicit copy so the column assignment below modifies an
# independent frame instead of a possible view of the original
# (avoids pandas' SettingWithCopyWarning).
births = births.query(
    '(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)'
).copy()

# set 'day' column to integer; it originally was a string due to nulls
births['day'] = births['day'].astype(int)

# In[4]:

# Choose a leap year to display births by date, so that Feb 29 is valid.
# `pd.datetime` was removed from pandas; use the standard-library class.
dates = [
    datetime(2012, month, day)
    for month, day in zip(births['month'], births['day'])
]

# In[5]:

# Plot the mean number of births for each calendar day
fig, ax = plt.subplots(figsize=(8, 6))
births.groupby(dates)['births'].mean().plot(ax=ax)

# Label the plot. Positions are given as explicit timestamps rather than
# bare strings so they do not depend on string parsing by the axis converter.
holiday_labels = [
    ('2012-1-1', 3950, "New Year's Day", 'left'),
    ('2012-7-4', 4250, "Independence Day", 'center'),
    ('2012-9-4', 4850, "Labor Day", 'center'),
    ('2012-10-31', 4600, "Halloween", 'right'),
    ('2012-11-25', 4450, "Thanksgiving", 'center'),
    ('2012-12-25', 3800, "Christmas", 'right'),
]
for when, height, label, align in holiday_labels:
    ax.text(pd.Timestamp(when), height, label, ha=align)

ax.set(
    title='USA births by day of year (1969-1988)',
    ylabel='average daily births',
    xlim=(pd.Timestamp('2011-12-20'), pd.Timestamp('2013-1-10')),
    ylim=(3700, 5400),
)

# Format the x axis with centered month labels: major ticks (unlabeled) sit
# on month boundaries, minor ticks on the 15th carry the month abbreviation.
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_minor_locator(mdates.MonthLocator(bymonthday=15))
ax.xaxis.set_major_formatter(plt.NullFormatter())
# '%b' is the portable spelling of the abbreviated month name; the original
# '%h' is a platform-specific strftime extension.
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%b'))

# Apparently American women *really* love their holidays!