My GitHub repositories

Fuzzy Wuzzy String Matching

```python
# -- DistrictNameMatching.py
# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
# Purpose: Given CSV lists of district-state name pairs, identifies the best match using the fuzzywuzzy library
# Notes: Default number of matches currently set to 3, though can be modified as input argument.

import os
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd


def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match=3):
    """
    This function takes two sets of district-state names, and produces a CSV with a set number (default=3)
    of matches and a flag for whether the district name has been completely matched.

    Manual work is then required for districts where a perfect match has not been made.

    master: file containing the master list of districts
    using: file containing the using list of districts, e.g., each of these districts is compared against the
        universe of master districts from the master file
    master_dist: variable name pertaining to districts in master file
    master_state: variable name pertaining to states in master file
    using_dist: variable name pertaining to districts in using file
    using_state: variable name pertaining to states in using file
    outFile: path and filename stub for the output file; ".csv" is appended
    num_match: number of matches generated, default is 3
    """

    master_dists = pd.read_csv(master, quotechar='"', skipinitialspace=True, sep=None)
    print(" *** Now printing column values for master file *** ")
    print(list(master_dists.columns.values))

    using_dists = pd.read_csv(using, quotechar='"', skipinitialspace=True, sep=None)
    print(" *** Now printing column values for using file *** ")
    print(list(using_dists.columns.values))

    # -- concatenate district and state names
    master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state]
    using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state]

    fhp_new = [process.extract(x, master_names, limit=num_match) for x in using_names]

    # -- generate column names: "Match1 Match2 ... MatchN"
    lab = ""
    i = 1
    while i <= num_match:
        lab = lab + " " + "Match" + str(i)
        i += 1

    fhp_matches = pd.DataFrame(fhp_new, columns=lab.split())

    # -- keep only the matched string from each (match, score, ...) tuple
    d = {}
    for x in range(1, num_match + 1):
        d["Match{0}".format(x)] = [y[0] for y in fhp_matches['Match' + str(x)]]

    d['using_original'] = using_names

    # -- flag rows where the top match reproduces the original name exactly
    d['perfect_match'] = d['Match1'] == d['using_original']

    out = pd.DataFrame(d)
    # out.to_stata(str(outFile + ".dta"))
    out.to_csv(str(outFile + ".csv"))
    print("**********************************************")
    print("*** Your analysis has been completed! *** ")
    print("**********************************************")

    return out


"""
BASIC FILES/PATHS WHOSE USE IS REPEATED
"""

baseDir = os.path.join("<insert path>")

outDir = os.path.join(baseDir, "Matched_Results")

if not os.path.exists(outDir):
    os.makedirs(outDir)


"""
ICRISAT and 1971 Polygon borders
"""

master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
input_1971 = os.path.join(baseDir, "District_Records_1971_Clean.csv")

outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")
icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)

# -- alternatively, don't save as a workspace object
districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)
```
Source: [DistrictNameMatching.py](https://gist.github.com/a8dx/20caedaf942f1810e16994bfdb57e8dc#file-districtnamematching-py) ([raw](https://gist.github.com/a8dx/20caedaf942f1810e16994bfdb57e8dc/raw/ec412554285823b873e37d523142082d79cd36d5/DistrictNameMatching.py))
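The heavy lifting in the script above is fuzzywuzzy's `process.extract`, which returns the top-scoring candidates for a query string. A minimal standalone sketch with made-up district names (the scores shown are illustrative only and depend on the default scorer):

```python
from fuzzywuzzy import process

# hypothetical master list of "district, state" strings
master_names = ["Pune, Maharashtra", "Patna, Bihar", "Puri, Odisha"]

# top-2 candidates for a misspelled query; each result is a (match, score) tuple
matches = process.extract("Poona, Maharashtra", master_names, limit=2)
print(matches)
# e.g. [('Pune, Maharashtra', 86), ('Patna, Bihar', 62)] -- scores are illustrative
```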

ZIP5 – County Lookup

```stata
/*
Filename: ZIP5_County_Lookup.do
Author: Anthony D'Agostino (ald at stanford dot edu)
Date Created: 02/09/2018
Last Edited: 03/27/2018
Purpose: Generate a merged zip5-county crosswalk that leverages the strengths of both the US Census and HUD ZCTA crosswalks.
*/

clear
clear all
clear matrix
set more off
set maxvar 25000, permanently

loc basePath "<your path here>"

// read in county codes, downloaded from: https://www.census.gov/geo/reference/codes/cou.html
import delimited using "`basePath'/national_county.txt", delim(",") clear stringcols(_all) varnames(nonames)

ren v1 state
ren v2 state_fips
ren v3 county_fips
ren v4 county_name
ren v5 fipsclasscode

gen county = state_fips + county_fips

tempfile counties
save `counties'

*import excel using "`basePath'/ZIP_COUNTY_032011.xlsx", firstrow clear
import excel using "`basePath'/ZIP_COUNTY_122017.xlsx", firstrow clear

// 39455 unique entries before any transformations
merge m:1 county using `counties', update replace
tab _merge
drop if _merge == 2
drop _merge

// sort descending so the county with the largest share of each ZIP comes first
gsort zip -tot_ratio
bys zip: gen totOrder = _n

gsort zip -res_ratio
bys zip: gen resOrder = _n

tw (hist res_ratio if resOrder == 1), xtitle("Residential Address Percent") graphregion(color(white) lwidth(large))
// keep the county holding the largest residential-address share of each ZIP
keep if resOrder == 1

// some basic cleaning
replace county_name = "Oglala Lakota, SD" if county == "46102"
replace state = "SD" if county == "46102"
replace state_fips = "46" if county == "46102"
replace county_fips = "102" if county == "46102"

// see http://www.nws.noaa.gov/om/notification/scn17-57kusilvak_ak.htm
replace county_name = "Kusilvak Census Area" if county == "02158"
replace state = "AK" if county == "02158"
replace state_fips = "02" if county == "02158"
replace county_fips = "158" if county == "02158"

drop resOrder totOrder
ren zip zip5
destring, replace

tempfile zip5_county
save `zip5_county'

import delimited using "`basePath'/zcta_county_rel_10.txt", delim(",") clear stringcols(_all) // varnames(nonames)

** 33120 unique ZCTA5
ren state state_fips_rel
ren county county_fips_rel

gen county = state_fips_rel + county_fips_rel

merge m:1 county using `counties', update replace
tab _merge
drop if _merge == 2
drop _merge

destring, replace

ren state state_rel
ren county county_rel
ren county_name county_name_rel
gen popabove50 = cond(zpoppct >= 50, 1, 0)

// apportionment: keep county matches with largest pop ZCTA5 share
gsort zcta5 -poppt
bys zcta5: gen popOrder = _n
keep if popOrder == 1

ren zpop pop_zip5
ren zcta5 zip5
keep zip5 pop_zip5 zpoppct county_rel state_rel county_name_rel state_fips_rel county_fips_rel

merge 1:1 zip5 using `zip5_county', update replace
tab _merge
drop _merge

// prioritize Census values, replace with HUD where missing
foreach x in "county" "state_fips" "county_fips" {
	gen `x'_final = `x'_rel
	replace `x'_final = `x' if `x'_final == .
	drop `x'
	ren `x'_final `x'
}

foreach x in "county_name" "state" {
	gen `x'_final = `x'_rel
	replace `x'_final = `x' if `x'_final == ""
	drop `x'
	ren `x'_final `x'
}

keep zip5 county county_fips county_name state state_fips
save "`basePath'/ZIP5_County_Crosswalk", replace
```
Source: [ZIP5\_County\_Lookup.do](https://gist.github.com/a8dx/7e9d5af24101fc66aafa739577713b59#file-zip5_county_lookup-do) ([raw](https://gist.github.com/a8dx/7e9d5af24101fc66aafa739577713b59/raw/bf4f4c31cf565aa920dd6a10a4522f8e5708ea40/ZIP5_County_Lookup.do))
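To spot-check or reuse the finished crosswalk outside Stata, here is a small sketch using pandas; the filename is assumed from the script's final `save` line, and the example ZIP code is arbitrary:

```python
import pandas as pd

# path assumed from the script's final `save`; prepend your basePath as needed
xwalk = pd.read_stata("ZIP5_County_Crosswalk.dta")

# sanity check: each ZIP5 should map to exactly one county
assert xwalk["zip5"].is_unique

# look up the county assignment for a single (arbitrary) ZIP code
print(xwalk.loc[xwalk["zip5"] == 94305, ["county", "county_name", "state"]])
```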