Code
Fuzzy Wuzzy String Matching
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. [Learn more about bidirectional Unicode characters](https://github.co/hiddenchars)
| | \# — DistrictNameMatching.py |
|---|---|
| | \# Author: Anthony Louis D'Agostino (ald2187 \[at\] columbia.edu) |
| | \# Purpose: Given CSV lists of district-state name pairs, identifies the best match given the fuzzywuzzy library |
| | \# Notes: Default number of matches currently set to 3, though can be modified as input argument. |
| | |
| | import os |
| | import numpy as np |
| | from fuzzywuzzy import fuzz |
| | from fuzzywuzzy import process |
| | import pandas as pd |
| | |
| | |
| | |
| | def districtMatch(master, using, master\_dist, master\_state, using\_dist, using\_state, outFile, num\_match = 3): |
| | """ |
| | This function takes two sets of district-state names, and produces a DTA with a set number (default=3) |
| | of matches with a flag for whether the district name has been completely matched. |
| | |
| | Manual work is then required for districts where a perfect match has not been made. |
| | |
| | master: file containing the master list of districts |
| | using: file containing using list of districts, eg., each of these districts is compared against these |
| | universe of master districts from the master file |
| | master\_dist: variable name pertaining to districts in master file |
| | master\_state: variable name pertaining to states in master file |
| | using\_dist: variable name pertaining to districts in using file |
| | using\_state: variable name pertaining to states in using file |
| | num\_match: number of matches generated, default is 3 |
| | outFile: includes path and filename for an outputted DTA file – should be "\*.dta" |
| | """ |
| | |
| | master\_dists = pd.read\_csv(master, quotechar = '"', skipinitialspace = True, sep = None) |
| | print " \*\*\* Now printing column values for master file \*\*\* " |
| | print list(master\_dists.columns.values) |
| | |
| | using\_dists = pd.read\_csv(using, quotechar = '"', skipinitialspace = True, sep = None) |
| | print " \*\*\* Now printing column values for using file \*\*\* " |
| | print list(using\_dists.columns.values) |
| | |
| | \# — concatenate district and state names |
| | master\_names = master\_dists\[master\_dist\].map(str) + ", " + master\_dists\[master\_state\] |
| | using\_names = using\_dists\[using\_dist\].map(str) + ", " + using\_dists\[using\_state\] |
| | |
| | fhp\_new = \[process.extract(x, master\_names, limit=num\_match) for x in using\_names\] |
| | |
| | \# — generate column names |
| | lab = "" |
| | i = 1 |
| | while i <= num\_match: |
| | lab = lab + " " + "Match" + str(i) |
| | i += 1 |
| | |
| | |
| | |
| | fhp\_matches = pd.DataFrame(fhp\_new, columns = lab.split()) |
| | |
| | d={} |
| | for x in range(1,num\_match+1): |
| | d\["Match{0}".format(x)\]=\[y\[0\] for y in fhp\_matches\['Match'+str(x)\]\] |
| | |
| | |
| | d\['using\_original'\] = using\_names |
| | |
| | |
| | \#match1 = \[x\[0\] for x in fhp\_matches\['Match1'\]\] |
| | d\['perfect\_match'\] = d\['Match1'\] == d\['using\_original'\] |
| | |
| | \#fhp\_matches\['perfect\_match'\] = pd.Series(perfect\_match, index = fhp\_matches.index) |
| | out = pd.DataFrame(d) |
| | \#out.to\_stata(str(outFile + ".dta")) |
| | out.to\_csv(str(outFile + ".csv")) |
| | print "\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*" |
| | print "\*\*\* Your analysis has been completed! \*\*\* " |
| | print "\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*\*" |
| | |
| | return out |
| | |
| | |
| | """ |
| | BASIC FILES/PATHS WHOSE USE IS REPEATED |
| | """ |
| | |
| | |
| | baseDir = os.path.join("<insert path>") |
| | |
| | |
| | outDir = os.path.join(baseDir, "Matched\_Results") |
| | |
| | if not os.path.exists(outDir): |
| | os.makedirs(outDir) |
| | |
| | |
| | |
| | |
| | |
| | """ |
| | ICRISAT and 1971 Polygon borders |
| | """ |
| | |
| | master\_file = os.path.join(baseDir, "ICRISAT\_Meso\_Names.csv") |
| | input\_1971 = os.path.join(baseDir,"District\_Records\_1971\_Clean.csv") |
| | |
| | outFile = os.path.join(outDir, "ICRI\_1971\_DistrictMatches") |
| | icri\_1971\_match = districtMatch(input\_1971, master\_file, 'NAME', 'STATE\_UT', 'distname', 'stname', outFile) |
| | |
| | \# — alternatively, don't save as a workspace object |
| | districtMatch(input\_1971, master\_file, 'NAME', 'STATE\_UT', 'distname', 'stname', outFile) |
| | |
[ Show hidden characters ](<>)
[view raw](https://gist.github.com/a8dx/20caedaf942f1810e16994bfdb57e8dc/raw/ec412554285823b873e37d523142082d79cd36d5/DistrictNameMatching.py) [ DistrictNameMatching.py ](https://gist.github.com/a8dx/20caedaf942f1810e16994bfdb57e8dc#file-districtnamematching-py) hosted with ❤ by [GitHub](https://github.com)
ZIP5 – County Lookup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. [Learn more about bidirectional Unicode characters](https://github.co/hiddenchars)
| | /\* |
|---|---|
| | Filename: ZIP5\_County\_Lookup.do |
| | Author: Anthony D'Agostino (ald at satanford dot edu) |
| | Date Created: 02/09/2018 |
| | Last Edited: 03/27/2018 |
| | Purpose: Generate a merged zip5-county crosswalk that leverages the strengths of both the US Census and HUD ZCTA crosswalks. |
| | |
| | \*/ |
| | |
| | |
| | clear |
| | clear all |
| | clear matrix |
| | set more off |
| | set maxvar 25000, permanently |
| | |
| | |
| | |
| | loc basePath "<your path here>" |
| | |
| | |
| | // read in county codes, downloaded from: https://www.census.gov/geo/reference/codes/cou.html |
| | import delimited using "`basePath'/national\_county.txt", delim(",") clear stringcols(\_all) varnames(nonames) |
| | |
| | ren v1 state |
| | ren v2 state\_fips |
| | ren v3 county\_fips |
| | ren v4 county\_name |
| | ren v5 fipsclasscode |
| | |
| | gen county = state\_fips + county\_fips |
| | |
| | tempfile counties |
| | save `counties' |
| | |
| | \*import excel using "`basePath'/ZIP\_COUNTY\_032011.xlsx", firstrow clear |
| | import excel using "`basePath'/ZIP\_COUNTY\_122017.xlsx", firstrow clear |
| | |
| | // 39455 unique entries before any transformations |
| | merge m:1 county using `counties', update replace |
| | tab \_merge |
| | drop if \_merge == 2 |
| | drop \_merge |
| | |
| | gsort zip –tot\_ratio |
| | bys zip: gen totOrder = \_n |
| | |
| | gsort zip –res\_ratio |
| | bys zip: gen resOrder = \_n |
| | |
| | tw (hist res\_ratio if resOrder == 1), xtitle("Residential Address Percent") graphregion(color(white) lwidth(large)) |
| | keep if resOrder == 1 |
| | |
| | // some basic cleaning |
| | replace county\_name = "Oglala Lakota, SD" if county == "46102" |
| | replace state = "SD" if county == "46102" |
| | replace state\_fips = "46" if county == "46102" |
| | replace county\_fips = "102" if county == "46102" |
| | |
| | // see http://www.nws.noaa.gov/om/notification/scn17-57kusilvak\_ak.htm |
| | replace county\_name = "Kusilvak Census Area" if county == "02158" |
| | replace state = "AK" if county == "02158" |
| | replace state\_fips = "02" if county == "02158" |
| | replace county\_fips = "158" if county == "02158" |
| | |
| | drop resOrder totOrder |
| | ren zip zip5 |
| | destring, replace |
| | |
| | tempfile zip5\_county |
| | save `zip5\_county' |
| | |
| | |
| | import delimited using "`basePath'/zcta\_county\_rel\_10.txt", delim(",") clear stringcols(\_all) // varnames(nonames) |
| | |
| | \*\* 33120 unique ZCTA5 |
| | ren state state\_fips\_rel |
| | ren county county\_fips\_rel |
| | |
| | gen county = state\_fips\_rel + county\_fips\_rel |
| | |
| | merge m:1 county using `counties', update replace |
| | tab \_merge |
| | drop if \_merge == 2 |
| | drop \_merge |
| | |
| | destring, replace |
| | |
| | ren state state\_rel |
| | ren county county\_rel |
| | ren county\_name county\_name\_rel |
| | gen popabove50 = cond(zpoppct >= 50, 1, 0) |
| | |
| | // apportionment: keep county matches with largest pop ZCTA5 share |
| | |
| | gsort zcta5 –poppt |
| | bys zcta5: gen popOrder = \_n |
| | keep if popOrder == 1 |
| | |
| | ren zpop pop\_zip5 |
| | ren zcta5 zip5 |
| | keep zip5 pop\_zip5 zpoppct county\_rel state\_rel county\_name\_rel state\_fips\_rel county\_fips\_rel |
| | |
| | |
| | merge 1:1 zip5 using `zip5\_county', update replace |
| | tab \_merge |
| | drop \_merge |
| | |
| | |
| | // prioritize Census values, replace with HUD where missing |
| | foreach x in "county" "state\_fips" "county\_fips" { |
| | gen `x'\_final = `x'\_rel |
| | replace `x'\_final = `x' if `x'\_final == . |
| | drop `x' |
| | ren `x'\_final `x' |
| | } |
| | |
| | foreach x in "county\_name" "state" { |
| | gen `x'\_final = `x'\_rel |
| | replace `x'\_final = `x' if `x'\_final == "" |
| | drop `x' |
| | ren `x'\_final `x' |
| | } |
| | |
| | keep zip5 county county\_fips county\_name state state\_fips |
| | save "`basePath'/ZIP5\_County\_Crosswalk", replace |
[ Show hidden characters ](<>)
[view raw](https://gist.github.com/a8dx/7e9d5af24101fc66aafa739577713b59/raw/bf4f4c31cf565aa920dd6a10a4522f8e5708ea40/ZIP5_County_Lookup.do) [ ZIP5\_County\_Lookup.do ](https://gist.github.com/a8dx/7e9d5af24101fc66aafa739577713b59#file-zip5_county_lookup-do) hosted with ❤ by [GitHub](https://github.com)