Today we will recreate the brilliant visualization from the NYT article "U.S. Virus Cases Climb Toward a Third Peak" — both the static visualization and the interactive range slider.

import pandas as pd
import geopandas as gpd
import altair as alt

# Hide the "..." action menu on embedded charts and use the default renderer.
alt.renderers.set_embed_options(actions=False)
# BUG FIX: `RendererRegistry` is a class inside altair and was never imported,
# so calling it unqualified raises NameError. Enable the renderer through the
# module-level registry instance instead.
alt.renderers.enable('default')

Shapefiles for US States from the topojson/us-atlas repository. Shapefiles from US Census are fine too.

# Load the US state shapes (TopoJSON served from the us-atlas CDN) and record
# each state's centroid so we can position the labels later.
states_shp_uri = 'https://cdn.jsdelivr.net/npm/us-atlas@3/states-10m.json'
states = gpd.read_file(states_shp_uri)
state_centroids = states['geometry'].centroid
states['lon'] = state_centroids.x
states['lat'] = state_centroids.y

We will remove the territories that are not supported by the albersUsa projection.

# Drop the territories the albersUsa projection cannot place
# (FIPS ids: 60 American Samoa, 66 Guam, 69 N. Mariana Is.,
# 72 Puerto Rico, 78 U.S. Virgin Islands).
unsupported_ids = ['69', '78', '60', '72', '66']
states = states[~states['id'].isin(unsupported_ids)]
states.head()
id name geometry lon lat
0 01 Alabama MULTIPOLYGON (((-88.33102 30.23539, -88.13002 ... -86.828534 32.788722
1 02 Alaska MULTIPOLYGON (((-150.24276 61.13748, -150.2212... -152.219940 64.201799
2 04 Arizona POLYGON ((-114.71951 32.71893, -114.70157 32.7... -111.664784 34.292803
3 08 Colorado POLYGON ((-109.04843 41.00026, -108.17982 41.0... -105.548071 38.998238
4 12 Florida MULTIPOLYGON (((-80.75043 24.85767, -80.70018 ... -82.497311 28.620301

Initial Visualization

# First look: plain state outlines under the Albers USA composite projection.
alt.Chart(states).mark_geoshape().encode().project('albersUsa')

Getting the COVID data as usual from NYT GitHub Repository

# NYT county-level COVID data. Keep `raw_data` as the pristine copy (it is
# re-used further down) and do all filtering on `covid`.
us_covid_data_uri = 'https://github.com/nytimes/covid-19-data/blob/master/us-counties.csv?raw=true'
raw_data = pd.read_csv(us_covid_data_uri).assign(
    date=lambda df: pd.to_datetime(df['date'])
)
covid = raw_data.copy()
covid.head()
date county state fips cases deaths
0 2020-01-21 Snohomish Washington 53061.0 1 0
1 2020-01-22 Snohomish Washington 53061.0 1 0
2 2020-01-23 Snohomish Washington 53061.0 1 0
3 2020-01-24 Cook Illinois 17031.0 1 0
4 2020-01-24 Snohomish Washington 53061.0 1 0

We will also load the shapefile for counties because we need it to calculate the longitudes and latitudes of the centroids of the county polygons.

# County shapes — needed for the centroid (lon/lat) of every county polygon.
counties = gpd.read_file('https://cdn.jsdelivr.net/npm/us-atlas@3/counties-10m.json')
counties.head()
id name geometry
0 04015 Mohave POLYGON ((-114.05190 36.84327, -114.05190 37.0...
1 22105 Tangipahoa POLYGON ((-90.56715 30.99995, -90.54921 30.999...
2 16063 Lincoln POLYGON ((-114.59389 43.19860, -114.37494 43.1...
3 27119 Polk POLYGON ((-97.14633 48.17341, -96.50026 48.174...
4 38017 Cass POLYGON ((-97.70626 47.23961, -97.45142 47.238...

Removing any empty geometries because they will cause errors when calculating longitudes and latitudes from the centroids

# Quick check: are any county geometries empty? (prints True below)
counties.geometry.is_empty.any()
True
# Drop rows whose geometry is empty — centroid computation errors on those —
# then derive each county's centroid coordinates.
empty_mask = counties.geometry.is_empty
counties = counties[~empty_mask]

county_centroids = counties['geometry'].centroid
counties['lon'] = county_centroids.x
counties['lat'] = county_centroids.y
counties.head()
id name geometry lon lat
0 04015 Mohave POLYGON ((-114.05190 36.84327, -114.05190 37.0... -113.758228 35.704987
1 22105 Tangipahoa POLYGON ((-90.56715 30.99995, -90.54921 30.999... -90.404976 30.626271
2 16063 Lincoln POLYGON ((-114.59389 43.19860, -114.37494 43.1... -114.138249 43.002348
3 27119 Polk POLYGON ((-97.14633 48.17341, -96.50026 48.174... -96.401592 47.774206
4 38017 Cass POLYGON ((-97.70626 47.23961, -97.45142 47.238... -97.248351 46.933123
# Align the shapefile with the COVID data: the TopoJSON `id` column is the
# county FIPS code; store it as an integer `fips` column for merging.
counties = counties.rename(columns={'id': 'fips'})
counties['fips'] = counties['fips'].astype(int)

The usual NYC aggregation because of the way the data is reported by NYT.

# Extract the five NYC boroughs and dissolve them into a single geometry,
# because NYT reports all of New York City as one unit (no per-borough fips).
# BUG FIX: `fips` was cast to int above, so matching it against the string
# codes ('36005', ...) selected nothing; compare against integers instead.
nyc_borough_fips = [36005, 36047, 36061, 36081, 36085]
boroughs_nyc = counties[counties['fips'].isin(nyc_borough_fips)].copy()
boroughs_nyc['State'] = "NYC"
agg_nyc_data = boroughs_nyc.dissolve(by='State').reset_index()
agg_nyc_data['fips'] = 1  # sentinel fips; the covid data's NYC rows get the same value
nyc_centroid = agg_nyc_data['geometry'].centroid
agg_nyc_data['lon'] = nyc_centroid.x
agg_nyc_data['lat'] = nyc_centroid.y
agg_nyc_data = agg_nyc_data.drop(columns=['State'])

counties = gpd.GeoDataFrame(pd.concat([counties, agg_nyc_data], ignore_index=True))
counties.tail()
fips name geometry lon lat
3226 28001 Adams POLYGON ((-91.37833 31.73273, -91.31732 31.745... -91.353097 31.479837
3227 36069 Ontario POLYGON ((-77.58109 42.94432, -77.48418 42.943... -77.300689 42.852556
3228 54053 Mason POLYGON ((-82.10001 38.95828, -82.02822 39.028... -82.026453 38.768440
3229 4025 Yavapai POLYGON ((-113.33405 35.52805, -113.26226 35.5... -112.554378 34.599427
3230 1 New York MULTIPOLYGON (((-74.20356 40.59307, -74.20356 ... -73.925717 40.696604
# Restart from the raw data and drop the territories that are not shown on
# the albersUsa map.
territories = ["Puerto Rico", "Virgin Islands", "Guam", "Northern Mariana Islands"]
covid = raw_data.copy()
covid = covid[~covid['state'].isin(territories)]

# NOTE(review): this section previously repeated the exact NYC borough
# aggregation already performed above. Running it a second time appends
# another "New York City" geometry (fips=1) to `counties`, which would
# duplicate rows in the later merge on fips — so the duplicate is removed.

# Make fips in the covid data for "New York City" equal to 1 to match the
# sentinel fips assigned to the dissolved NYC geometry above.
covid.loc[covid['county'] == 'New York City','fips'] = 1

# Daily new cases per county: first difference of the cumulative count.
covid = covid.assign(daily=covid.groupby('fips')['cases'].diff())

# Sum of new cases over the 14 days preceding each date.
# FIX: use transform() rather than apply() so the result keeps the original
# row index — apply() on a grouped series can return a (fips, index)
# MultiIndex on newer pandas versions, which misaligns on assignment.
covid = covid.assign(
    past_2_weeks=covid.groupby('fips')['daily'].transform(
        lambda s: s.rolling(window=14).sum().shift(1)
    )
)

# Keep only the snapshot date used in the article.
snapshot_date = '2020-10-27'
covid = covid[covid['date'] == snapshot_date]

# Merge 2019 census population estimates so we can compute per-capita rates.
census = pd.read_csv('co-est2019-alldata.csv')
census = census[['STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'POPESTIMATE2019']]

# Build the 5-digit county FIPS from the 2-digit state and 3-digit county codes.
census['STATE'] = census.STATE.astype(str).str.zfill(2)
census['COUNTY'] = census.COUNTY.astype(str).str.zfill(3)
census = census.assign(fips=(census.STATE + census.COUNTY).astype(int))
census = census.drop(columns=['STATE', 'COUNTY', 'STNAME', 'CTYNAME'])

# NOTE(review): the covid rows for New York City carry the sentinel fips=1,
# which has no counterpart in the census table — this inner merge likely
# drops NYC entirely; verify and add a fips=1 population row if NYC should plot.
covid = covid.merge(census, on='fips', how='inner')

# Cases in the past two weeks per 1,000 residents; drop rows where the
# 14-day rolling window is not yet defined.
covid = covid.assign(past_2_weeks_per_capita=(covid.past_2_weeks / covid.POPESTIMATE2019) * 1000)
covid = covid[covid['past_2_weeks_per_capita'].notna()]

# Per-capita deaths and cases (per 1,000 residents).
covid = covid.assign(
    deaths_per_capita=covid.deaths / covid.POPESTIMATE2019 * 1000,
    cases_per_capita=covid.cases / covid.POPESTIMATE2019 * 1000,
)

# Attach the county geometries and centroids to the covid statistics.
plot_data = counties.merge(covid, on='fips', how='inner')

print(plot_data.info())
plot_data.head()
<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 3129 entries, 0 to 3128
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   fips                     3129 non-null   int64         
 1   name                     3129 non-null   object        
 2   geometry                 3129 non-null   geometry      
 3   lon                      3129 non-null   float64       
 4   lat                      3129 non-null   float64       
 5   date                     3129 non-null   datetime64[ns]
 6   county                   3129 non-null   object        
 7   state                    3129 non-null   object        
 8   cases                    3129 non-null   int64         
 9   deaths                   3129 non-null   int64         
 10  daily                    3129 non-null   float64       
 11  past_2_weeks             3129 non-null   float64       
 12  POPESTIMATE2019          3129 non-null   int64         
 13  past_2_weeks_per_capita  3129 non-null   float64       
 14  deaths_per_capita        3129 non-null   float64       
 15  cases_per_capita         3129 non-null   float64       
dtypes: datetime64[ns](1), float64(7), geometry(1), int64(4), object(3)
memory usage: 415.6+ KB
None
fips name geometry lon lat date county state cases deaths daily past_2_weeks POPESTIMATE2019 past_2_weeks_per_capita deaths_per_capita cases_per_capita
0 4015 Mohave POLYGON ((-114.05190 36.84327, -114.05190 37.0... -113.758228 35.704987 2020-10-27 Mohave Arizona 4313 230 41.0 183.0 212181 0.862471 1.083980 20.326985
1 22105 Tangipahoa POLYGON ((-90.56715 30.99995, -90.54921 30.999... -90.404976 30.626271 2020-10-27 Tangipahoa Louisiana 5004 126 21.0 176.0 134758 1.306045 0.935009 37.133231
2 16063 Lincoln POLYGON ((-114.59389 43.19860, -114.37494 43.1... -114.138249 43.002348 2020-10-27 Lincoln Idaho 199 1 13.0 81.0 5366 15.095043 0.186359 37.085352
3 27119 Polk POLYGON ((-97.14633 48.17341, -96.50026 48.174... -96.401592 47.774206 2020-10-27 Polk Minnesota 724 4 51.0 268.0 31364 8.544828 0.127535 23.083790
4 38017 Cass POLYGON ((-97.70626 47.23961, -97.45142 47.238... -97.248351 46.933123 2020-10-27 Cass North Dakota 8925 84 131.0 2259.0 181923 12.417341 0.461734 49.059217

Making a custom height column with SVG path strings so that we can scale the spike heights using that column

def _spike_path(value):
    # SVG path for a triangular spike (half-width 1) whose height is 2x the value.
    return f"M -1 0 L 0 -{value*2} L 1 0"

# Encode each county's two-week per-capita count as a custom spike shape.
plot_data = plot_data.assign(height=plot_data['past_2_weeks_per_capita'].apply(_spike_path))

Plotting the data -

#collapse
# Base layer: light-grey state polygons.
# FIX: the original also encoded text/longitude/latitude on this layer —
# copy-pasted from the label layer below. A geoshape mark draws the geometry
# directly, so those channels are dropped here.
base = alt.Chart(states).mark_geoshape(
    fill='#ededed', stroke='white'
).project('albersUsa')

# State-name labels placed at each state's centroid.
text = alt.Chart(states).mark_text().encode(
    text='name:N',
    longitude='lon:Q',
    latitude='lat:Q',
    tooltip=['name:N'],
).project('albersUsa')

# Red spikes: the custom SVG path in `height` grows with the two-week
# per-capita case count; scale=None passes the path string through unscaled.
spikes = alt.Chart(plot_data).mark_point(
        fillOpacity=0.5,
        fill='red',
        strokeOpacity=1,
        strokeWidth=1,
        stroke='red',
    ).encode(
        shape=alt.Shape('height', scale=None),
        longitude='lon:Q',
        latitude='lat:Q',
    ).project('albersUsa').properties(width=1200, height=900)

(base + text + spikes).configure_view(stroke=None)