# --- Notebook setup -------------------------------------------------------
# NOTE(review): this file is a Jupyter-notebook transcript that was flattened
# onto single physical lines; statements have been re-split and IPython-only
# magics commented out so the file is syntactically valid Python again.
import pandas as pd
import seaborn as sns
import numpy as np

sns.set(font='Bitstream Vera Sans')
# NOTE(review): in modern seaborn the 2nd positional arg of set_context is
# font_scale, not an rc dict — confirm against the seaborn version in use.
sns.set_context('poster', {'figsize': (12, 8)})
pd.options.display.max_rows = 8

# IPython magics — only valid inside a notebook/IPython session:
# %matplotlib inline
# %cd ~/2014_fall_ASTR599/notebooks/
# %run talktools.py
# %cd ~/notebook/

import pandas as pd  # we did this already!

# did it work?
pd.__version__

# interactive help in the notebook:
# pd.DataFrame(   <- press shift-tab here for the call signature

# loading a csv:
# df = pd.read_csv(fname)   # NOTE(review): `fname` was never defined here

# loading an excel file:
# df = pd.read_   <- press tab here for completions (read_excel, ...)

# loading a stata file:
# df = pd.read_stata(...)

df = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]})
df

# think of DataFrames as numpy arrays plus named columns and a row index
df.columns
df.index
df.b.index is df.index

# load the PHMRC verbal-autopsy "gold standard" data from GHDx
url = ('http://ghdx.healthdata.org/sites/default/files/'
       'record-attached-files/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv')
df = pd.read_csv(url)
df = pd.read_csv(url, low_memory=False)  # avoids the mixed-dtype warning
df

# also load codebook (excel doc)
url = ('http://ghdx.healthdata.org/sites/default/files/'
       'record-attached-files/IHME_PHMRC_VA_DATA_CODEBOOK_Y2013M09D11_0.xlsx')
cb = pd.read_excel(url)
cb.head()

# each column of pd.DataFrame is a pd.Series
cb.module

# can use square-brackets instead of "dot"
# (useful if column name has spaces!)
cb.iloc[3, 3]

# what's in this series?
cb.module.value_counts()

# accessing individual cells
df.head()

# to access by row and column, can use names
df.loc[4, 'gs_text34']
# or use numbers
df.iloc[4, 3]  # same, because column 3 is gs_text34
# how to check?
# --- Indexing, grouping, and reshaping ------------------------------------
# NOTE(review): operates on `df` and `cb` loaded earlier in this file.
df.columns[3]
df.head()

# make the row names more interesting than numbers starting from zero
df.index = ['person %d' % (i + 1) for i in df.index]

# have a look at the first ten rows and first five columns
df.iloc[:10, :5]

# slicing with named indices is possible, too
df.loc['person 10', 'site':'gs_text34']

# logical operations (element-wise)
df.module == "Adult"

# useful for selecting subset of rows
dfa = df[df.module == "Adult"]

# summarize
dfa.va34.describe(percentiles=[.025, .975])

# percent of occurrences
100 * dfa.gs_text34.value_counts(normalize=True).round(3)

# calculate average
dfa.word_fever.mean()

# create visual summaries
dfa.site.value_counts(ascending=True).plot(kind='barh', figsize=(12, 8))

df.groupby('gs_text34')
df.word_activ
df.groupby('gs_text34').word_fever.mean()

# FIX(review): Series.order() was removed in pandas 0.20 — use sort_values()
(dfa.groupby('gs_text34').word_activ.mean() * 100).sort_values().plot(
    kind='barh', figsize=(12, 12))

df.filter(like='word').head()

# get a smaller table to experiment with
t = df.filter(like='word').describe()
t

# transpose table
t.T.head()

# convert from "wide" to "long"
t_long = t.T.stack()
t_long

# convert back
t_long.unstack()

# challenge: find the most common word
df.head()

# count, per site and cause of death, how often "fever" was mentioned
piv = pd.pivot_table(df, values='word_fever', index=['site'],
                     columns=['gs_text34'], aggfunc=sum)
piv

causes = list(piv.columns)
piv['site'] = piv.index
fever_cnts = pd.melt(piv, id_vars=['site'], value_vars=causes,
                     var_name='cause of death',
                     value_name='number w fever mentioned')
fever_cnts.head()

# cleaner to drop the rows with NaNs
fever_cnts.dropna().head()

# put together first three and last three rows of df
# FIX(review): DataFrame.append() was deprecated (removed in pandas 2.0) —
# use pd.concat instead
pd.concat([df.iloc[:3], df.iloc[-3:]])

df.iloc[:5, 20:25]
cb.iloc[10:15]

# interactive help in the notebook:
# pd.merge(   <- press shift-tab here for the call signature

# join the (transposed) data to the codebook on the variable name
merged_df = pd.merge(df.T, cb, left_index=True, right_on='variable')
merged_df.filter(['variable', 'question', 'person 1', 'person 2']).dropna().head(10)

df.groupby('gs_text34').word_fever.mean()

for g, dfg in df.groupby('gs_text34'):
    print(g)  # FIX(review): Python-2 `print g` -> print(g)
    # process DataFrame for this group
    break

# apply
def my_func(row):
    """Return True when the row mentions "x-ray" (word_xray count > 0)."""
    return row['word_xray'] > 0
# --- apply, time series, and in-class exercises ---------------------------
# NOTE(review): depends on `df`, `cb`, and `my_func` defined earlier in file.
my_func(df.iloc[0])
df.apply(my_func, axis=1)  # row-wise apply: one boolean per person

# a quick time-series example: 1000 seconds of random counts
rng = pd.date_range('1/1/2012', periods=1000, freq='S')
rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
# FIX(review): resample(..., how='sum') was removed in pandas 0.20 —
# use the method-chain form instead
ts.resample('5Min').sum()

# in here:
cb.head()
# hint: look here
cb.question.str

# exercise: pick a variable name from the codebook, e.g.
# var = 'a2_02'

# create a new column with 0/1 values instead of Yes/No/Don't Know/Refused to Answer
# (uncomment once `var` is chosen above)
# df['fever'] = df[var] == 'Yes'

# use groupby like we did in class

import matplotlib.pyplot as plt

# do some plotting like we did in class
plt.xlabel('Endorsement Rate (%)')

# this might be familiar now...
# could be good to start by making some numeric columns like above
# groupby is a way to go, but not the only way
# note: there are lots of ways to do this
# try to find one that is so simple, you will understand it next time you look
# some interesting differences. if you want to know why, you might be a social scientist...