# --- Notebook setup -------------------------------------------------------
# NOTE(review): this file is a Jupyter-notebook transcript that was flattened
# onto single physical lines; statements have been re-split and IPython-only
# magics commented out so the file is syntactically valid Python again.
import pandas as pd
import seaborn as sns
import numpy as np

sns.set(font='Bitstream Vera Sans')
# NOTE(review): in modern seaborn the 2nd positional arg of set_context is
# font_scale, not an rc dict — confirm against the seaborn version in use.
sns.set_context('poster', {'figsize': (12, 8)})
pd.options.display.max_rows = 8

# IPython magics — only valid inside a notebook/IPython session:
# %matplotlib inline
# %cd ~/2014_fall_ASTR599/notebooks/
# %run talktools.py
# %cd ~/notebook/

import pandas as pd  # we did this already!

# did it work?
pd.__version__

# interactive help in the notebook:
# pd.DataFrame(   <- press shift-tab here for the call signature

# loading a csv:
# df = pd.read_csv(fname)   # NOTE(review): `fname` was never defined here

# loading an excel file:
# df = pd.read_   <- press tab here for completions (read_excel, ...)

# loading a stata file:
# df = pd.read_stata(...)

df = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60]})
df

# think of DataFrames as numpy arrays plus named columns and a row index
df.columns
df.index
df.b.index is df.index

# load the PHMRC verbal-autopsy "gold standard" data from GHDx
url = ('http://ghdx.healthdata.org/sites/default/files/'
       'record-attached-files/IHME_PHMRC_VA_DATA_ADULT_Y2013M09D11_0.csv')
df = pd.read_csv(url)
df = pd.read_csv(url, low_memory=False)  # avoids the mixed-dtype warning
df

# also load codebook (excel doc)
url = ('http://ghdx.healthdata.org/sites/default/files/'
       'record-attached-files/IHME_PHMRC_VA_DATA_CODEBOOK_Y2013M09D11_0.xlsx')
cb = pd.read_excel(url)
cb.head()

# each column of pd.DataFrame is a pd.Series
cb.module

# can use square-brackets instead of "dot"
# (useful if column name has spaces!)
cb.iloc[3, 3]

# what's in this series?
cb.module.value_counts()

# accessing individual cells
df.head()

# to access by row and column, can use names
df.loc[4, 'gs_text34']
# or use numbers
df.iloc[4, 3]  # same, because column 3 is gs_text34
# how to check?
# --- Indexing, grouping, and reshaping ------------------------------------
# NOTE(review): operates on `df` and `cb` loaded earlier in this file.
df.columns[3]
df.head()

# make the row names more interesting than numbers starting from zero
df.index = ['person %d' % (i + 1) for i in df.index]

# have a look at the first ten rows and first five columns
df.iloc[:10, :5]

# slicing with named indices is possible, too
df.loc['person 10', 'site':'gs_text34']

# logical operations (element-wise)
df.module == "Adult"

# useful for selecting subset of rows
dfa = df[df.module == "Adult"]

# summarize
dfa.va34.describe(percentiles=[.025, .975])

# percent of occurrences
100 * dfa.gs_text34.value_counts(normalize=True).round(3)

# calculate average
dfa.word_fever.mean()

# create visual summaries
dfa.site.value_counts(ascending=True).plot(kind='barh', figsize=(12, 8))

df.groupby('gs_text34')
df.word_activ
df.groupby('gs_text34').word_fever.mean()

# FIX(review): Series.order() was removed in pandas 0.20 — use sort_values()
(dfa.groupby('gs_text34').word_activ.mean() * 100).sort_values().plot(
    kind='barh', figsize=(12, 12))

df.filter(like='word').head()

# get a smaller table to experiment with
t = df.filter(like='word').describe()
t

# transpose table
t.T.head()

# convert from "wide" to "long"
t_long = t.T.stack()
t_long

# convert back
t_long.unstack()

# challenge: find the most common word
df.head()

# count, per site and cause of death, how often "fever" was mentioned
piv = pd.pivot_table(df, values='word_fever', index=['site'],
                     columns=['gs_text34'], aggfunc=sum)
piv

causes = list(piv.columns)
piv['site'] = piv.index
fever_cnts = pd.melt(piv, id_vars=['site'], value_vars=causes,
                     var_name='cause of death',
                     value_name='number w fever mentioned')
fever_cnts.head()

# cleaner to drop the rows with NaNs
fever_cnts.dropna().head()

# put together first three and last three rows of df
# FIX(review): DataFrame.append() was deprecated (removed in pandas 2.0) —
# use pd.concat instead
pd.concat([df.iloc[:3], df.iloc[-3:]])

df.iloc[:5, 20:25]
cb.iloc[10:15]

# interactive help in the notebook:
# pd.merge(   <- press shift-tab here for the call signature

# join the (transposed) data to the codebook on the variable name
merged_df = pd.merge(df.T, cb, left_index=True, right_on='variable')
merged_df.filter(['variable', 'question', 'person 1', 'person 2']).dropna().head(10)

df.groupby('gs_text34').word_fever.mean()

for g, dfg in df.groupby('gs_text34'):
    print(g)  # FIX(review): Python-2 `print g` -> print(g)
    # process DataFrame for this group
    break

# apply
def my_func(row):
    """Return True when the row mentions "x-ray" (word_xray count > 0)."""
    return row['word_xray'] > 0
# --- apply, time series, and in-class exercises ---------------------------
# NOTE(review): depends on `df`, `cb`, and `my_func` defined earlier in file.
my_func(df.iloc[0])
df.apply(my_func, axis=1)  # row-wise apply: one boolean per person

# a quick time-series example: 1000 seconds of random counts
rng = pd.date_range('1/1/2012', periods=1000, freq='S')
rng
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
# FIX(review): resample(..., how='sum') was removed in pandas 0.20 —
# use the method-chain form instead
ts.resample('5Min').sum()

# in here:
cb.head()
# hint: look here
cb.question.str

# exercise: pick a variable name from the codebook, e.g.
# var = 'a2_02'

# create a new column with 0/1 values instead of Yes/No/Don't Know/Refused to Answer
# (uncomment once `var` is chosen above)
# df['fever'] = df[var] == 'Yes'

# use groupby like we did in class

import matplotlib.pyplot as plt

# do some plotting like we did in class
plt.xlabel('Endorsement Rate (%)')

# this might be familiar now...
# could be good to start by making some numeric columns like above
# groupby is a way to go, but not the only way
# note: there are lots of ways to do this
# try to find one that is so simple, you will understand it next time you look
# some interesting differences. if you want to know why, you might be a social scientist...