My GitHub repositories

Fuzzy Wuzzy String Matching

```python
# -- DistrictNameMatching.py
# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
# Purpose: Given CSV lists of district-state name pairs, identifies the best match using the fuzzywuzzy library
# Notes: Default number of matches currently set to 3, though can be modified as input argument.

import os
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd


def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match=3):
    """
    This function takes two sets of district-state names, and produces a CSV with a set number (default=3)
    of matches and a flag for whether the district name has been completely matched.

    Manual work is then required for districts where a perfect match has not been made.

    master: file containing the master list of districts
    using: file containing the using list of districts, e.g., each of these districts is compared against the
        universe of master districts from the master file
    master_dist: variable name pertaining to districts in master file
    master_state: variable name pertaining to states in master file
    using_dist: variable name pertaining to districts in using file
    using_state: variable name pertaining to states in using file
    outFile: path and filename stub for the output file; ".csv" is appended
    num_match: number of matches generated, default is 3
    """

    master_dists = pd.read_csv(master, quotechar='"', skipinitialspace=True, sep=None)
    print(" *** Now printing column values for master file *** ")
    print(list(master_dists.columns.values))

    using_dists = pd.read_csv(using, quotechar='"', skipinitialspace=True, sep=None)
    print(" *** Now printing column values for using file *** ")
    print(list(using_dists.columns.values))

    # -- concatenate district and state names
    master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state]
    using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state]

    fhp_new = [process.extract(x, master_names, limit=num_match) for x in using_names]

    # -- generate column names: "Match1 Match2 ... MatchN"
    lab = ""
    i = 1
    while i <= num_match:
        lab = lab + " " + "Match" + str(i)
        i += 1

    fhp_matches = pd.DataFrame(fhp_new, columns=lab.split())

    # -- keep only the matched string from each (match, score, ...) tuple
    d = {}
    for x in range(1, num_match + 1):
        d["Match{0}".format(x)] = [y[0] for y in fhp_matches['Match' + str(x)]]

    d['using_original'] = using_names

    # -- flag rows where the top match reproduces the original name exactly
    d['perfect_match'] = d['Match1'] == d['using_original']

    out = pd.DataFrame(d)
    # out.to_stata(str(outFile + ".dta"))
    out.to_csv(str(outFile + ".csv"))
    print("**********************************************")
    print("*** Your analysis has been completed! *** ")
    print("**********************************************")

    return out


"""
BASIC FILES/PATHS WHOSE USE IS REPEATED
"""

baseDir = os.path.join("<insert path>")

outDir = os.path.join(baseDir, "Matched_Results")

if not os.path.exists(outDir):
    os.makedirs(outDir)


"""
ICRISAT and 1971 Polygon borders
"""

master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
input_1971 = os.path.join(baseDir, "District_Records_1971_Clean.csv")

outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")
icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)

# -- alternatively, don't save as a workspace object
districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)
```
Source: [DistrictNameMatching.py](https://gist.github.com/a8dx/20caedaf942f1810e16994bfdb57e8dc#file-districtnamematching-py) ([raw](https://gist.github.com/a8dx/20caedaf942f1810e16994bfdb57e8dc/raw/ec412554285823b873e37d523142082d79cd36d5/DistrictNameMatching.py))
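The heavy lifting in the script above is fuzzywuzzy's `process.extract`, which returns the top-scoring candidates for a query string. A minimal standalone sketch with made-up district names (the scores shown are illustrative only and depend on the default scorer):

```python
from fuzzywuzzy import process

# hypothetical master list of "district, state" strings
master_names = ["Pune, Maharashtra", "Patna, Bihar", "Puri, Odisha"]

# top-2 candidates for a misspelled query; each result is a (match, score) tuple
matches = process.extract("Poona, Maharashtra", master_names, limit=2)
print(matches)
# e.g. [('Pune, Maharashtra', 86), ('Patna, Bihar', 62)] -- scores are illustrative
```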

ZIP5 – County Lookup

```stata
/*
Filename: ZIP5_County_Lookup.do
Author: Anthony D'Agostino (ald at stanford dot edu)
Date Created: 02/09/2018
Last Edited: 03/27/2018
Purpose: Generate a merged zip5-county crosswalk that leverages the strengths of both the US Census and HUD ZCTA crosswalks.
*/

clear
clear all
clear matrix
set more off
set maxvar 25000, permanently

loc basePath "<your path here>"

// read in county codes, downloaded from: https://www.census.gov/geo/reference/codes/cou.html
import delimited using "`basePath'/national_county.txt", delim(",") clear stringcols(_all) varnames(nonames)

ren v1 state
ren v2 state_fips
ren v3 county_fips
ren v4 county_name
ren v5 fipsclasscode

gen county = state_fips + county_fips

tempfile counties
save `counties'

*import excel using "`basePath'/ZIP_COUNTY_032011.xlsx", firstrow clear
import excel using "`basePath'/ZIP_COUNTY_122017.xlsx", firstrow clear

// 39455 unique entries before any transformations
merge m:1 county using `counties', update replace
tab _merge
drop if _merge == 2
drop _merge

// sort descending so the county with the largest share of each ZIP comes first
gsort zip -tot_ratio
bys zip: gen totOrder = _n

gsort zip -res_ratio
bys zip: gen resOrder = _n

tw (hist res_ratio if resOrder == 1), xtitle("Residential Address Percent") graphregion(color(white) lwidth(large))
// keep the county holding the largest residential-address share of each ZIP
keep if resOrder == 1

// some basic cleaning
replace county_name = "Oglala Lakota, SD" if county == "46102"
replace state = "SD" if county == "46102"
replace state_fips = "46" if county == "46102"
replace county_fips = "102" if county == "46102"

// see http://www.nws.noaa.gov/om/notification/scn17-57kusilvak_ak.htm
replace county_name = "Kusilvak Census Area" if county == "02158"
replace state = "AK" if county == "02158"
replace state_fips = "02" if county == "02158"
replace county_fips = "158" if county == "02158"

drop resOrder totOrder
ren zip zip5
destring, replace

tempfile zip5_county
save `zip5_county'

import delimited using "`basePath'/zcta_county_rel_10.txt", delim(",") clear stringcols(_all) // varnames(nonames)

** 33120 unique ZCTA5
ren state state_fips_rel
ren county county_fips_rel

gen county = state_fips_rel + county_fips_rel

merge m:1 county using `counties', update replace
tab _merge
drop if _merge == 2
drop _merge

destring, replace

ren state state_rel
ren county county_rel
ren county_name county_name_rel
gen popabove50 = cond(zpoppct >= 50, 1, 0)

// apportionment: keep county matches with largest pop ZCTA5 share
gsort zcta5 -poppt
bys zcta5: gen popOrder = _n
keep if popOrder == 1

ren zpop pop_zip5
ren zcta5 zip5
keep zip5 pop_zip5 zpoppct county_rel state_rel county_name_rel state_fips_rel county_fips_rel

merge 1:1 zip5 using `zip5_county', update replace
tab _merge
drop _merge

// prioritize Census values, replace with HUD where missing
foreach x in "county" "state_fips" "county_fips" {
	gen `x'_final = `x'_rel
	replace `x'_final = `x' if `x'_final == .
	drop `x'
	ren `x'_final `x'
}

foreach x in "county_name" "state" {
	gen `x'_final = `x'_rel
	replace `x'_final = `x' if `x'_final == ""
	drop `x'
	ren `x'_final `x'
}

keep zip5 county county_fips county_name state state_fips
save "`basePath'/ZIP5_County_Crosswalk", replace
```
Source: [ZIP5\_County\_Lookup.do](https://gist.github.com/a8dx/7e9d5af24101fc66aafa739577713b59#file-zip5_county_lookup-do) ([raw](https://gist.github.com/a8dx/7e9d5af24101fc66aafa739577713b59/raw/bf4f4c31cf565aa920dd6a10a4522f8e5708ea40/ZIP5_County_Lookup.do))
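To spot-check or reuse the finished crosswalk outside Stata, here is a small sketch using pandas; the filename is assumed from the script's final `save` line, and the example ZIP code is arbitrary:

```python
import pandas as pd

# path assumed from the script's final `save`; prepend your basePath as needed
xwalk = pd.read_stata("ZIP5_County_Crosswalk.dta")

# sanity check: each ZIP5 should map to exactly one county
assert xwalk["zip5"].is_unique

# look up the county assignment for a single (arbitrary) ZIP code
print(xwalk.loc[xwalk["zip5"] == 94305, ["county", "county_name", "state"]])
```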