#!/usr/bin/env python
# coding: utf-8

# # `regex` workflow

# In[6]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

import re
from pathlib import Path

import pandas as pd
from IPython.display import display


# ### Jeremy Howard is the guest lecturer for Lesson 9!
#
# #### In the video, he gives a three-part lesson plan:
# * regex workflow
# * svd
# * transfer learning.
#
# Jeremy mentions that he uses `regex` every day in his work, and that it is essential for machine learning practitioners to develop a working knowledge of `regex`. Since we've already done deep dives into `svd` and into `transfer learning`, we'll focus on the `regex` part of this video, `from 1:50 to 21:29`.

# ### A simple `regex` exercise
# #### To illustrate the power of `regex` and familiarize us with the way he works, Jeremy poses the following problem:
# Let's extract all the phone numbers from the Austin Public Health Locations database and create a list of the phone numbers in the standard format `(ddd) ddd dddd`. He shows how to use `vim` to accomplish this task.

# Let's listen to Jeremy for the next 20 minutes or so:

# In[53]:


from IPython.display import HTML

# Play youtube video
# NOTE(review): the markup string below is empty, so nothing is embedded; the
# video's embed HTML appears to be missing from this export -- confirm and
# restore it.
HTML('')


# #### Some of the takeaways from the video, paraphrased:
# 1. A necessary but not sufficient condition for success
# What is the greatest difference between people who succeed and people who do not? It's entirely about tenacity. If you are willing to focus on the task and keep trying, you have a good chance of succeeding.
#
# 2. Workflow
# Work in an interactive environment, such as `vim`, or `jupyter notebook`, so you can try things, get immediate feedback, revise, and progress toward a solution.
#
# 3. Debugging
# When your code fails, remember that the computer is doing exactly what you asked. A good general approach is to break the code up into smaller parts, then run it again, and find out which part doesn't work.
#
# 4. Humility
# It's never "I think the problem in the code is X". A better approach is to start with the working assumption "I am an idiot, and I don't understand why things aren't working". Be willing to start from scratch and check every little step.

# #### OK, let's get to work on our task. We'll use `jupyter notebook` as our interactive environment.


# Helpers used by the cells below. Keeping each step in its own small function
# lets us try it on a short string before running it on the whole database.

CSV_FILE_NAME = 'Austin_Public_Health_Locations.csv'

# One phone number in the database's `ddd-ddd-dddd` layout, captured as three
# digit groups. The group lengths are fixed and the lookarounds reject a match
# that sits inside a longer run of digits, so other hyphenated numbers in the
# file are not mistaken for phone numbers.
re_extract_phone_number = re.compile(r"(?<!\d)(\d{3})-(\d{3})-(\d{4})(?!\d)")


def read_text_on_one_line(csv_path):
    """Return the contents of the file at *csv_path* as one line of text.

    Newlines are replaced with a single space rather than removed, so the last
    field of one row is never glued onto the first field of the next row.
    The file is decoded as UTF-8, which is also what `pd.read_csv` assumes by
    default.
    """
    with open(csv_path, 'r', encoding='utf-8') as file:
        return file.read().replace('\n', ' ')


def extract_phone_numbers(text):
    """Return every phone number found in *text*.

    Each result is a tuple of three digit strings, in the order they appear in
    the number; the list is empty when *text* contains no phone number.
    """
    return re_extract_phone_number.findall(text)


def format_phone_number(parts):
    """Format one tuple of three digit strings as `(ddd) ddd dddd`."""
    area_code, exchange, line_number = parts
    return '({}) {} {}'.format(area_code, exchange, line_number)


# The cells that need the CSV on disk run only when this file is executed
# directly (as a script or inside a notebook, where `__name__` is "__main__"),
# so the helpers above can be imported and tested without the data file.
if __name__ == "__main__":
    # ## 1. Get the Austin Public Health Locations database
    # #### https://data.austintexas.gov/Health-and-Community-Services/Austin-Public-Health-Locations/6v78-dj3u/data

    # In[13]:

    # Folder holding the downloaded CSV; edit this to match your own machine.
    path = 'C:/Users/cross-entropy/.fastai/data/Austin_Public_Health_Locations'
    csv_path = Path(path) / CSV_FILE_NAME

    # #### Read the data into a pandas dataframe.
    # From the `Phone Number` column, we see that the phone numbers are in the format `ddd-ddd-dddd`.

    # In[56]:

    df = pd.read_csv(csv_path)
    display(df)

    # #### Read the database into a raw text string.
    # This will be our starting point.

    # In[54]:

    data = read_text_on_one_line(csv_path)
    print(data)

    # ## 2. Extract the phone numbers
    # #### We first construct a regular expression to match the phone numbers and break them into tuples. This involved a bit of trial and error.

    # In[57]:

    # (`re_extract_phone_number` is compiled once, next to the helpers above.)

    # In[59]:

    phone_number_list = extract_phone_numbers(data)
    display(phone_number_list)

    # ## 3. Put the phone numbers in the desired format
    # #### Next we wrap the area code in parentheses and join the three groups with spaces:

    # In[52]:

    formatted_phone_numbers = [
        format_phone_number(parts) for parts in phone_number_list
    ]
    display(formatted_phone_numbers)

    # #### Voila! Finis.

    # In[ ]: