Skip to content

openavmkit.utilities.census

CensusCredentials

CensusCredentials(api_key)

Object for storing US Census API credentials

Attributes:

Name Type Description
api_key str

API Key for the US Census

Initialize a CensusCredentials object

Parameters: api_key (str) – API key for the US Census.

Source code in openavmkit/utilities/census.py
19
20
21
22
23
24
25
def __init__(self, api_key: str) -> None:
    """Initialize a CensusCredentials object

    Parameters
    ----------
    api_key : str
        API Key for the US Census
    """
    # Stored as-is; no validation of the key happens in this constructor.
    self.api_key = api_key

CensusService

CensusService(credentials)

Provides functions for downloading data from the US Census

Attributes:

Name Type Description
credentials CensusCredentials

Credentials for the US Census

census_client Census

US Census API Client object

Initialize the CensusService object

Parameters:

Name Type Description Default
credentials CensusCredentials

Credentials for the US Census

required
Source code in openavmkit/utilities/census.py
40
41
42
43
44
45
46
47
48
49
def __init__(self, credentials: CensusCredentials) -> None:
    """Initialize the CensusService object

    Stores the credentials and builds the ``Census`` client from their API key.

    Parameters
    ----------
    credentials : CensusCredentials
        Credentials for the US Census
    """
    self.credentials = credentials
    # The client is created once here from credentials.api_key and kept on the
    # instance as ``census_client``.
    self.census_client = Census(credentials.api_key)

get_census_blockgroups_shapefile

get_census_blockgroups_shapefile(fips_code)

Get Census Block Group shapefiles for a given FIPS code from the Census TIGERweb service.

Parameters:

Name Type Description Default
fips_code str

5-digit FIPS code (state + county)

required

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing Census Block Group boundaries

Raises:

Type Description
TypeError

If fips_code is not a string

ValueError

If fips_code is not 5 digits

RequestException

If API request fails

Source code in openavmkit/utilities/census.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def get_census_blockgroups_shapefile(self, fips_code: str) -> gpd.GeoDataFrame:
    """Get Census Block Group shapefiles for a given FIPS code from the Census TIGERweb service.

    Queries the TIGERweb ``Tracts_Blocks`` map service for every feature whose
    STATE and COUNTY attributes match the FIPS code, requesting GeoJSON in
    WGS84, and appends standardized GEOID component columns.

    Parameters
    ----------
    fips_code : str
        5-digit FIPS code (state + county)

    Returns
    -------
    gpd.GeoDataFrame
        GeoDataFrame containing Census Block Group boundaries in EPSG:4326,
        with the added columns ``state_fips``, ``county_fips``,
        ``tract_fips``, ``bg_fips`` and ``std_geoid`` (their concatenation)

    Raises
    ------
    TypeError
        If fips_code is not a string
    ValueError
        If fips_code is not 5 digits, or the service returns no features for it
    requests.RequestException
        If API request fails or the response carries no feature collection
    """
    if not isinstance(fips_code, str):
        raise TypeError("fips_code must be a string")
    # The value is interpolated into the service's WHERE clause below, so only
    # plain ASCII digits are accepted (str.isdigit alone also accepts other
    # Unicode digit characters).
    if len(fips_code) != 5 or not (fips_code.isascii() and fips_code.isdigit()):
        raise ValueError("fips_code must be 5 digits (state + county)")

    # TIGERweb REST API endpoint for block groups
    base_url = "https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/Tracts_Blocks/MapServer/1/query"

    # Query parameters
    params = {
        "where": f"STATE='{fips_code[:2]}' AND COUNTY='{fips_code[2:]}'",
        "outFields": "*",
        "returnGeometry": "true",
        "f": "geojson",
        "outSR": "4326",  # WGS84
    }

    # Only the network round trip lives in the try block so that unrelated
    # errors are not relabelled as request failures.
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(base_url, params=params, timeout=60)
        response.raise_for_status()
        geojson_data = response.json()
    except requests.RequestException as e:
        raise requests.RequestException(
            f"Failed to fetch Census Block Group data: {str(e)}"
        ) from e

    # A successful HTTP status does not guarantee a feature collection; report
    # a payload without "features" as a request failure instead of a KeyError.
    features = (
        geojson_data.get("features") if isinstance(geojson_data, dict) else None
    )
    if features is None:
        raise requests.RequestException(
            "Failed to fetch Census Block Group data: response contained no "
            f"'features' ({str(geojson_data)[:200]})"
        )
    if not features:
        raise ValueError(f"No Census Block Groups found for FIPS code {fips_code}")

    # Convert to GeoDataFrame, tagging it with the CRS requested via outSR
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")

    # Create standardized GEOID components
    gdf["state_fips"] = gdf["STATE"]
    gdf["county_fips"] = gdf["COUNTY"]
    gdf["tract_fips"] = gdf["TRACT"]
    gdf["bg_fips"] = gdf["BLKGRP"]

    # Create standardized GEOID
    gdf["std_geoid"] = (
        gdf["state_fips"]
        + gdf["county_fips"]
        + gdf["tract_fips"]
        + gdf["bg_fips"]
    )

    return gdf

get_census_data

get_census_data(fips_code, year=2022)

Get Census demographic data for block groups in a given FIPS code.

Parameters:

Name Type Description Default
fips_code str

5-digit FIPS code (state + county)

required
year int

Census year to query (default: 2022)

2022

Returns:

Type Description
DataFrame

DataFrame containing Census demographic data

Raises:

Type Description
TypeError

If fips_code is not a string or year is not an int

ValueError

If fips_code is not 5 digits

Source code in openavmkit/utilities/census.py
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def get_census_data(self, fips_code: str, year: int = 2022) -> pd.DataFrame:
    """Get Census demographic data for block groups in a given FIPS code.

    Requests the ``NAME``, ``B19013_001E`` and ``B01003_001E`` fields for
    every block group in the county through ``self.census_client.acs5``,
    renames the two variables to ``median_income`` and ``total_pop``, and adds
    standardized GEOID component columns.

    Parameters
    ----------
    fips_code : str
        5-digit FIPS code (state + county)
    year : int
        Census year to query (default: 2022)

    Returns
    -------
    pd.DataFrame
        DataFrame containing Census demographic data, with the added columns
        ``state_fips``, ``county_fips``, ``tract_fips``, ``bg_fips`` and
        ``std_geoid`` (their concatenation)

    Raises
    ------
    TypeError
        If fips_code is not a string or year is not an int
    ValueError
        If fips_code is not 5 digits, or the API returns no rows for it
    """
    if not isinstance(fips_code, str):
        raise TypeError("fips_code must be a string")
    # bool is a subclass of int; reject it so True/False are not sent as a year
    if isinstance(year, bool) or not isinstance(year, int):
        raise TypeError("year must be an integer")
    # Only plain ASCII digits are accepted (str.isdigit alone also accepts
    # other Unicode digit characters).
    if len(fips_code) != 5 or not (fips_code.isascii() and fips_code.isdigit()):
        raise ValueError("fips_code must be 5 digits (state + county)")

    # Split FIPS code into state and county
    state_fips = fips_code[:2]
    county_fips = fips_code[2:]

    # Get block group data
    data = self.census_client.acs5.state_county_blockgroup(
        fields=[
            "NAME",
            "B19013_001E",  # Median income
            "B01003_001E",  # Total population
        ],
        state_fips=state_fips,
        county_fips=county_fips,
        blockgroup="*",  # All block groups
        year=year,
    )

    # Convert to DataFrame
    df = pd.DataFrame(data)
    # An empty result has none of the geography columns used below; fail with
    # a clear message instead of a KeyError on "state".
    if df.empty:
        raise ValueError(
            f"No Census data returned for FIPS code {fips_code} and year {year}"
        )

    # Rename columns
    df = df.rename(
        columns={"B19013_001E": "median_income", "B01003_001E": "total_pop"}
    )

    # Create GEOID for block groups (state+county+tract+block group)
    df["state_fips"] = df["state"]
    df["county_fips"] = df["county"]
    df["tract_fips"] = df["tract"]
    df["bg_fips"] = df["block group"]

    # Create standardized GEOID
    df["std_geoid"] = (
        df["state_fips"] + df["county_fips"] + df["tract_fips"] + df["bg_fips"]
    )

    return df

get_census_data_with_boundaries

get_census_data_with_boundaries(fips_code, year=2022)

Get both Census demographic data and boundary files for block groups in a FIPS code.

Parameters:

Name Type Description Default
fips_code str

5-digit FIPS code (state + county)

required
year int

Census year to query (default: 2022)

2022

Returns:

Type Description
Tuple[pd.DataFrame, gpd.GeoDataFrame]:
  • Census demographic data DataFrame
  • Census Block Group boundaries GeoDataFrame

Raises:

Type Description
TypeError

If inputs have wrong types

ValueError

If inputs have invalid values

RequestException

If API requests fail

Source code in openavmkit/utilities/census.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
def get_census_data_with_boundaries(
    self, fips_code: str, year: int = 2022
) -> Tuple[pd.DataFrame, gpd.GeoDataFrame]:
    """Fetch block-group demographics and boundaries for a FIPS code and join them.

    The demographic table is retrieved first; boundaries are then restricted
    to the block groups present in that table and merged with it on
    ``std_geoid``. A warning is printed when any merged row contains a missing
    value.

    Parameters
    ----------
    fips_code : str
        5-digit FIPS code (state + county)
    year : int
        Census year to query (default: 2022)

    Returns
    -------
    Tuple[pd.DataFrame, gpd.GeoDataFrame]:

        - Census demographic data DataFrame
        - Census Block Group boundaries GeoDataFrame, merged with the
          demographic data

    Raises
    ------
    TypeError
        If inputs have wrong types
    ValueError
        If inputs have invalid values
    requests.RequestException
        If API requests fail
    """
    # Demographics come first: their GEOIDs decide which boundaries are kept.
    demographics = self.get_census_data(fips_code, year)
    known_geoids = demographics["std_geoid"].unique()

    boundaries = self.get_census_blockgroups_shapefile(fips_code)
    has_demographics = boundaries["std_geoid"].isin(known_geoids)

    # Attach the demographic columns to the surviving boundary rows.
    merged = boundaries[has_demographics].merge(
        demographics, on="std_geoid", how="left"
    )

    # Report GEOIDs of rows where any column is null after the merge.
    rows_with_gaps = merged.isna().any(axis=1)
    missing_geoids = merged[rows_with_gaps]["std_geoid"].unique()
    missing_count = len(missing_geoids)
    if missing_count:
        print(
            f"\nWarning: Found {missing_count} block groups with missing data"
        )
        print("First few missing GEOIDs:", missing_geoids[:5])

    return demographics, merged

get_creds_from_env_census

get_creds_from_env_census()

Get Census credentials from environment variables.

Returns:

Type Description
CensusCredentials

Census API credentials

Raises:

Type Description
ValueError

If required environment variables are missing

Source code in openavmkit/utilities/census.py
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
def get_creds_from_env_census() -> CensusCredentials:
    """Build Census credentials from the ``CENSUS_API_KEY`` environment variable.

    Returns
    -------
    CensusCredentials
        Census API credentials wrapping the key found in the environment

    Raises
    ------
    ValueError
        If ``CENSUS_API_KEY`` is unset or empty
    """
    key_from_env = os.environ.get("CENSUS_API_KEY")
    # An unset variable and an empty string are both treated as missing.
    if key_from_env:
        return CensusCredentials(key_from_env)
    raise ValueError("Missing Census API key in environment.")

init_service_census

init_service_census(credentials)

Initialize a Census service with the provided credentials.

Parameters:

Name Type Description Default
credentials CensusCredentials

Census API credentials

required

Returns:

Type Description
CensusService

Initialized Census service

Raises:

Type Description
ValueError

If credentials are invalid

Source code in openavmkit/utilities/census.py
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
def init_service_census(credentials: CensusCredentials) -> CensusService:
    """Create a ``CensusService`` from a ``CensusCredentials`` object.

    Parameters
    ----------
    credentials : CensusCredentials
        Census API credentials

    Returns
    -------
    CensusService
        Service constructed with the given credentials

    Raises
    ------
    ValueError
        If ``credentials`` is not a ``CensusCredentials`` instance
    """
    if isinstance(credentials, CensusCredentials):
        return CensusService(credentials)
    # Anything other than a CensusCredentials instance is rejected.
    raise ValueError("Invalid credentials for Census service.")

match_to_census_blockgroups

match_to_census_blockgroups(gdf, census_gdf, join_type='left')

Match each row in a GeoDataFrame to its corresponding Census Block Group using spatial join.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input GeoDataFrame to match

required
census_gdf GeoDataFrame

Census Block Group boundaries GeoDataFrame

required
join_type str

Type of join to perform ('left', 'right', 'inner', 'outer')

'left'

Returns:

Type Description
GeoDataFrame

Input GeoDataFrame with Census Block Group data appended

Source code in openavmkit/utilities/census.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
def match_to_census_blockgroups(
    gdf: gpd.GeoDataFrame, census_gdf: gpd.GeoDataFrame, join_type: str = "left"
) -> gpd.GeoDataFrame:
    """Match each row in a GeoDataFrame to its corresponding Census Block Group using
    spatial join.

    Each input geometry is reduced to its centroid, and the centroids are
    spatially joined to ``census_gdf`` with the ``intersects`` predicate. The
    number of records that received a ``std_geoid`` is printed.

    Parameters
    ----------
    gdf : gpd.GeoDataFrame
        Input GeoDataFrame to match. Must have a CRS set. It is not modified.
    census_gdf : gpd.GeoDataFrame
        Census Block Group boundaries GeoDataFrame
    join_type : str
        Type of join to perform ('left', 'right', 'inner', 'outer'). The value
        is passed straight to ``sjoin`` as ``how``.

    Returns
    -------
    gpd.GeoDataFrame
        Input GeoDataFrame with Census Block Group data appended. When the
        join produces rows, the result also carries an ``area`` column and,
        because the original geometry column is dropped before the join, its
        active geometry falls back to the ``centroid`` column whenever the
        original geometry column is not present in the joined frame. When the
        join produces no rows, a copy of ``gdf`` is returned with the census
        columns present in ``census_gdf`` added and set to None.

    Raises
    ------
    TypeError
        If gdf or census_gdf is not a GeoDataFrame
    ValueError
        If join_type is not one of the accepted values, or gdf has no CRS
    """
    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("gdf must be a GeoDataFrame")
    if not isinstance(census_gdf, gpd.GeoDataFrame):
        raise TypeError("census_gdf must be a GeoDataFrame")
    if join_type not in ["left", "right", "inner", "outer"]:
        raise ValueError("join_type must be one of: 'left', 'right', 'inner', 'outer'")
    # NOTE(review): 'outer' passes the check above but is forwarded unchanged
    # to sjoin -- confirm that sjoin accepts it.
    if gdf.crs is None:
        # Without a CRS the geographic/projected decision below cannot be made.
        raise ValueError("gdf must have a CRS set before matching to block groups")

    # Create a copy of the input GeoDataFrame to avoid modifying the original
    gdf_for_join = gdf.copy()

    # Store original geometry column name
    orig_geom_col = gdf_for_join.geometry.name

    if gdf_for_join.crs.is_geographic:
        # Centroids computed in a geographic CRS (like WGS84/EPSG:4326) are
        # unreliable, so move both layers to an equal-area CRS first.
        projected_crs = get_crs(gdf_for_join, "equal_area")
        gdf_for_join = gdf_for_join.to_crs(projected_crs)
        census_gdf = census_gdf.to_crs(projected_crs)
    elif census_gdf.crs is not None and census_gdf.crs != gdf_for_join.crs:
        # The input is already projected: bring the census layer into the same
        # CRS so the spatial join compares like with like.
        census_gdf = census_gdf.to_crs(gdf_for_join.crs)

    # Calculate centroids in the projected CRS
    gdf_for_join["centroid"] = gdf_for_join.geometry.centroid

    # Create a temporary GeoDataFrame with centroids for the spatial join
    centroid_gdf = gpd.GeoDataFrame(
        gdf_for_join.drop(columns=[orig_geom_col]),
        geometry="centroid",
        crs=gdf_for_join.crs,
    )

    # Perform the spatial join
    joined = centroid_gdf.sjoin(census_gdf, predicate="intersects", how=join_type)

    if joined.empty:
        # No matches: return the input with the census columns added (all
        # None), working on a copy so the caller's frame is left untouched.
        unmatched = gdf.copy()
        for col in ("std_geoid", "median_income", "total_pop"):
            if col in census_gdf.columns:
                unmatched[col] = None

        print(f"Census block group matching: 0 of {len(gdf)} records matched (0.00%)")
        return unmatched

    # Calculate areas for each match
    joined["area"] = joined.geometry.area

    # A centroid on a shared boundary intersects several block groups and so
    # appears once per match under the same index label. Keep the
    # smallest-area match per label (ties keep the first). Selecting by label
    # with .loc would return every duplicate again, so duplicates are dropped
    # explicitly. This only applies when the result index identifies input
    # records (left/inner joins on a unique input index).
    if join_type in ("left", "inner") and gdf.index.is_unique:
        joined = joined.sort_values("area", kind="mergesort")
        joined = joined[~joined.index.duplicated(keep="first")]
    # Stable sort keeps rows grouped by index label in ascending order.
    joined = joined.sort_index(kind="mergesort")

    try:
        joined = joined.set_geometry(orig_geom_col)
    except Exception:
        # Best-effort fallback: the original geometry column is normally
        # absent after the join, so use the centroid points instead.
        joined = joined.set_geometry("centroid")

    # Calculate and print percentage of records with valid census geoid
    valid_geoid_count = joined["std_geoid"].notna().sum()
    valid_percentage = (valid_geoid_count / len(gdf)) * 100 if len(gdf) else 0.0
    print(
        f"Census block group matching: {valid_geoid_count} of {len(gdf)} records have valid census geoid ({valid_percentage:.2f}%)"
    )

    return joined