openavmkit.utilities.data

add_sqft_fields

add_sqft_fields(df_in)

Add per-square-foot fields to the DataFrame for land and improvement values.

This function creates new columns based on existing value fields and area fields.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_in | DataFrame | Input DataFrame | required |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | DataFrame with additional per-square-foot fields |

Source code in openavmkit/utilities/data.py
def add_sqft_fields(df_in: pd.DataFrame):
    """Add per-square-foot fields to the DataFrame for land and improvement values.

    This function creates new columns based on existing value fields and area fields.

    Parameters
    ----------
    df_in : pd.DataFrame
        Input DataFrame

    Returns
    -------
    pd.DataFrame
        DataFrame with additional per-square-foot fields
    """
    df = df_in.copy()
    land_sqft = [
        "model_market_value",
        "model_land_value",
        "assr_market_value",
        "assr_land_value",
    ]
    impr_sqft = [
        "model_market_value",
        "model_impr_value",
        "assr_market_value",
        "assr_impr_value",
    ]
    for field in land_sqft:
        if field in df:
            df[field + "_land_sqft"] = div_series_z_safe(
                df[field], df["land_area_sqft"]
            )
    for field in impr_sqft:
        if field in df:
            df[field + "_impr_sqft"] = div_series_z_safe(
                df[field], df["bldg_area_finished_sqft"]
            )
    return df
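
A minimal usage sketch, assuming the function is importable from openavmkit.utilities.data as the module path above suggests; the parcel values are illustrative:

```python
import pandas as pd
from openavmkit.utilities.data import add_sqft_fields

# Illustrative parcels with one value field and both area fields the function looks for
df = pd.DataFrame({
    "key": ["A", "B"],
    "model_market_value": [200_000, 350_000],
    "land_area_sqft": [5_000, 0],               # a zero area is handled safely
    "bldg_area_finished_sqft": [1_500, 2_000],
})

out = add_sqft_fields(df)
# Adds model_market_value_land_sqft and model_market_value_impr_sqft;
# the zero land area yields a missing value instead of a division error.
print(out[["model_market_value_land_sqft", "model_market_value_impr_sqft"]])
```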

align_categories

align_categories(df_left, df_right)

Ensure matching categorical dtypes and unified category sets across two DataFrames.

For each column present in either DataFrame, if either side has a pandas Categorical dtype, this function will:

  1. Convert the other side's column to Categorical (if not already), using the first side's existing categories.
  2. Compute the union of both categorical sets (preserving order: first df_left's then any new from df_right) and assign this combined set to both DataFrames.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_left | DataFrame | First DataFrame whose categorical columns will be aligned. | required |
| df_right | DataFrame | Second DataFrame whose categorical columns will be aligned. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| left_aligned | DataFrame | Copy of df_left where any column that was categorical in either input is now Categorical with the union of both category sets. |
| right_aligned | DataFrame | Copy of df_right similarly adjusted to share the same categories. |

Notes
  • Columns not of Categorical dtype on either side remain unchanged.
  • Missing values are preserved and treated as NaN in the Categorical dtype.
  • The original column order and non-categorical columns are unaffected.
Source code in openavmkit/utilities/data.py
def align_categories(
    df_left: pd.DataFrame, df_right: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Ensure matching categorical dtypes and unified category sets across two DataFrames.

    For each column present in either DataFrame, if *either* side has a
    pandas Categorical dtype, this function will:

    1. Convert the other side's column to Categorical (if not already), using
       the first side's existing categories.
    2. Compute the union of both categorical sets (preserving order: first
       df_left's then any new from df_right) and assign this combined set
       to both DataFrames.

    Parameters
    ----------
    df_left : pandas.DataFrame
        First DataFrame whose categorical columns will be aligned.
    df_right : pandas.DataFrame
        Second DataFrame whose categorical columns will be aligned.

    Returns
    -------
    left_aligned : pandas.DataFrame
        Copy of `df_left` where any column that was categorical in either input
        is now Categorical with the union of both category sets.
    right_aligned : pandas.DataFrame
        Copy of `df_right` similarly adjusted to share the same categories.

    Notes
    -----
    * Columns not of Categorical dtype on either side remain unchanged.
    * Missing values are preserved and treated as NaN in the Categorical dtype.
    * The original column order and non-categorical columns are unaffected.
    """

    for col in df_left.columns.union(df_right.columns):

        left_is_cat = isinstance(
            df_left.get(col, pd.Series(dtype="object")).dtype, pd.CategoricalDtype
        )
        right_is_cat = isinstance(
            df_right.get(col, pd.Series(dtype="object")).dtype, pd.CategoricalDtype
        )

        # If exactly one side is categorical, convert the other side first
        if left_is_cat and not right_is_cat:
            df_right[col] = pd.Categorical(
                df_right[col], categories=df_left[col].cat.categories
            )
            right_is_cat = True
        elif right_is_cat and not left_is_cat:
            df_left[col] = pd.Categorical(
                df_left[col], categories=df_right[col].cat.categories
            )
            left_is_cat = True

        # Now, if both are categorical, give them the same (union) category list
        if left_is_cat and right_is_cat:
            cats = df_left[col].cat.categories.union(df_right[col].cat.categories)
            df_left[col] = df_left[col].cat.set_categories(cats)
            df_right[col] = df_right[col].cat.set_categories(cats)

    return df_left, df_right
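
A short sketch of the category unification, under the same import assumption; the column name and values are illustrative:

```python
import pandas as pd
from openavmkit.utilities.data import align_categories

df_left = pd.DataFrame({"zoning": pd.Categorical(["R1", "R2"])})
df_right = pd.DataFrame({"zoning": pd.Categorical(["R2", "C1"])})

df_left, df_right = align_categories(df_left, df_right)

# Both columns now carry the same category list: the union of 'R1', 'R2', and 'C1'.
print(df_left["zoning"].cat.categories.equals(df_right["zoning"].cat.categories))  # True
```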

calc_spatial_lag

calc_spatial_lag(df_sample, df_univ, value_fields, neighbors=5, exclude_self_in_sample=False)

Compute spatial lag features via Gaussian-weighted averages of nearest neighbors.

Builds a cKDTree on the coordinates in df_sample and, for each location in df_univ, finds its neighbors nearest points in df_sample (neighbors being the requested neighbor count). A spatial lag is calculated for each field in value_fields as the weighted mean of the neighbor values, using a Gaussian kernel whose bandwidth equals the mean neighbor distance (σ) for each prediction point. Missing or zero distances are handled to avoid division by zero. Optionally excludes the point itself when computing its own lag.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_sample | DataFrame | DataFrame of sample points containing at least columns 'latitude', 'longitude', and each field in value_fields. Used to train the nearest-neighbor tree and source values for lag computation. | required |
| df_univ | DataFrame | DataFrame of prediction points containing 'latitude' and 'longitude'. May include additional columns; output will append lag columns to this. | required |
| value_fields | list of str | List of column names in df_sample whose spatial lags will be computed. | required |
| neighbors | int | Number of nearest neighbors to query for each prediction point. Must be at least 2 to allow exclusion of self when exclude_self_in_sample=True. | 5 |
| exclude_self_in_sample | bool | If True, the nearest neighbor at distance zero (self) is excluded from the lag calculation by dropping the first neighbor in the query results. | False |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | A copy of df_univ with new columns named 'spatial_lag_<field>' for each requested field. Missing lag values are filled with the median of the corresponding field in df_sample. |

Raises:

| Type | Description |
| --- | --- |
| ValueError | If neighbors < 2, since at least two neighbors are required to compute a spatial lag (especially when excluding the self-distance). |

Notes
  • Uses SciPy’s cKDTree for efficient nearest-neighbor lookup.
  • Gaussian kernel weights are computed as exp(-d_ij² / (2 · σ_i²)), where d_ij is the distance from point i to neighbor j, and σ_i is the mean of its k neighbor distances.

  • Weights are then normalized so that they sum to 1 for each prediction point.

Source code in openavmkit/utilities/data.py
def calc_spatial_lag(
    df_sample: pd.DataFrame,
    df_univ: pd.DataFrame,
    value_fields: list[str],
    neighbors: int = 5,
    exclude_self_in_sample: bool = False,
) -> pd.DataFrame:
    """Compute spatial lag features via Gaussian-weighted averages of nearest neighbors.

    Builds a cKDTree on the coordinates in `df_sample` and, for each location in
    `df_univ`, finds its `neighbors` nearest points in `df_sample`.  A spatial lag
    is calculated for each field in `value_fields` as the weighted mean of the
    neighbor values using a Gaussian kernel with bandwidth equal to the mean
    neighbor distance (σ) for each prediction point.  Missing or zero distances
    are handled to avoid division by zero.  Optionally excludes the point itself
    when computing its own lag.

    Parameters
    ----------
    df_sample : pandas.DataFrame
        DataFrame of sample points containing at least columns 'latitude',
        'longitude', and each field in `value_fields`.  Used to train the
        nearest-neighbor tree and source values for lag computation.
    df_univ : pandas.DataFrame
        DataFrame of prediction points containing 'latitude' and 'longitude'.
        May include additional columns; output will append lag columns to this.
    value_fields : list of str
        List of column names in `df_sample` whose spatial lags will be computed.
    neighbors : int, default 5
        Number of nearest neighbors to query for each prediction point.  Must be
        at least 2 to allow exclusion of self when `exclude_self_in_sample=True`.
    exclude_self_in_sample : bool, default False
        If True, the nearest neighbor at distance zero (self) is excluded from
        the lag calculation by dropping the first neighbor in the query results.

    Returns
    -------
    pandas.DataFrame
        A copy of `df_univ` with new columns named 'spatial_lag_<field>' for each
        requested field.  Missing lag values are filled with the median of the
        corresponding field in `df_sample`.

    Raises
    ------
    ValueError
        If `neighbors < 2`, since at least two neighbors are required to compute
        a spatial lag (especially when excluding the self-distance).

    Notes
    -----
    - Uses SciPy’s cKDTree for efficient nearest-neighbor lookup.
    - Gaussian kernel weights are computed as:
      ```
      exp(–(d_ij²) / (2 · σ_i²))
      ```
      , where ``d_ij`` is the distance from point ``i`` to neighbor ``j``,
      and ``σ_i`` is the mean of its ``k`` neighbor distances.

    - Weights are then normalized so that they sum to 1 for each prediction point.


    """
    df = df_univ.copy()

    # Build a cKDTree from df_sample coordinates

    # we TRAIN on these coordinates -- coordinates that are NOT in the test set
    coords_train = df_sample[["latitude", "longitude"]].values
    tree = cKDTree(coords_train)

    # we PREDICT on these coordinates -- all the coordinates in the universe
    coords_all = df[["latitude", "longitude"]].values

    for value_field in value_fields:
        if value_field not in df_sample:
            print("Value field not in df_sample, skipping")
            continue

        # Choose the number of nearest neighbors to use
        k = neighbors  # You can adjust this number as needed

        # Query the tree: for each parcel in df_univ, find the k nearest parcels
        # distances: shape (n_universe, k); indices: corresponding indices in df_sample
        distances, indices = tree.query(coords_all, k=k)

        if exclude_self_in_sample:
            distances = distances[:, 1:]  # Exclude self-distance
            indices = indices[:, 1:]  # Exclude self-index

        # k must be at least 2 so that distances and indices remain 2D arrays
        if k < 2:
            raise ValueError("k must be at least 2 to compute spatial lag.")

        # For each universe parcel, compute sigma as the mean distance to its k neighbors.
        sigma = distances.mean(axis=1, keepdims=True)

        # Handle zeros in sigma
        sigma[sigma == 0] = np.finfo(float).eps  # Avoid division by zero

        # Compute Gaussian kernel weights for all neighbors
        weights = np.exp(-(distances**2) / (2 * sigma**2))

        # Normalize the weights so that they sum to 1 for each parcel
        weights_norm = weights / weights.sum(axis=1, keepdims=True)

        # Get the values corresponding to the neighbor indices
        parcel_values = df_sample[value_field].values
        neighbor_values = parcel_values[indices]  # shape (n_universe, k)

        # Compute the weighted average (spatial lag) for each parcel in the universe
        spatial_lag = (np.asarray(weights_norm) * np.asarray(neighbor_values)).sum(
            axis=1
        )

        # Add the spatial lag as a new column
        df[f"spatial_lag_{value_field}"] = spatial_lag

        median_value = df_sample[value_field].median()
        df[f"spatial_lag_{value_field}"] = df[f"spatial_lag_{value_field}"].fillna(
            median_value
        )

    return df
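
A self-contained sketch with randomly generated, illustrative coordinates and values, assuming the function is importable from openavmkit.utilities.data:

```python
import numpy as np
import pandas as pd
from openavmkit.utilities.data import calc_spatial_lag

rng = np.random.default_rng(0)

# Illustrative "sample" parcels with a known value field (e.g. sale-based estimates)
df_sample = pd.DataFrame({
    "latitude": rng.uniform(30.0, 30.1, 20),
    "longitude": rng.uniform(-97.8, -97.7, 20),
    "model_market_value": rng.uniform(100_000, 400_000, 20),
})

# The full universe of prediction points that should receive lag features
df_univ = pd.DataFrame({
    "latitude": rng.uniform(30.0, 30.1, 50),
    "longitude": rng.uniform(-97.8, -97.7, 50),
})

df_out = calc_spatial_lag(df_sample, df_univ, ["model_market_value"], neighbors=5)
print(df_out["spatial_lag_model_market_value"].describe())
```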

clean_column_names

clean_column_names(df)

Clean the column names in a DataFrame by replacing forbidden characters with legal representations. For one-hot encoded columns (containing '='), ensures clean formatting.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | DataFrame | Input DataFrame | required |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | DataFrame with cleaned column names |

Source code in openavmkit/utilities/data.py
def clean_column_names(df: pd.DataFrame):
    """Clean the column names in a DataFrame by replacing forbidden characters with legal
    representations. For one-hot encoded columns (containing '='), ensures clean formatting.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame

    Returns
    -------
    pd.DataFrame
        DataFrame with cleaned column names
    """
    # Find column names that contain forbidden characters and replace them with legal representations.
    replace_map = {
        "[": "_",
        "]": "_",
        "<NA>": "_NA_",
        "/": "_",
        "\\": "_",
        ":": "_",
        "*": "_",
        "?": "_",
        '"': "_",
        "<": "_",
        ">": "_",
        "|": "_",
        " ": "_",  # Replace spaces with underscores
        "-": "_",  # Replace hyphens with underscores
        ",": "_",  # Replace commas with underscores
        ";": "_",  # Replace semicolons with underscores
        ".": "_",  # Replace periods with underscores
        "(": "_",  # Replace parentheses with underscores
        ")": "_",
    }

    # First pass - replace special characters
    for key in replace_map:
        df.columns = df.columns.str.replace(key, replace_map[key], regex=False)

    # Second pass - clean up one-hot encoded column names
    new_columns = []
    for col in df.columns:
        if "=" in col:
            # Handle one-hot encoded columns
            base, value = col.split("=", 1)
            # Clean up the base and value
            base = base.strip()
            value = value.strip()
            # Replace multiple underscores with single underscore
            base = "_".join(filter(None, base.split("_")))
            value = "_".join(filter(None, value.split("_")))
            new_col = f"{base}__{value}"  # Use double underscore as separator
        else:
            # For non-one-hot columns, just clean up multiple underscores
            new_col = "_".join(filter(None, col.split("_")))

        new_columns.append(new_col)

    df.columns = new_columns
    return df
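
A brief sketch of the renaming rules, with illustrative column names (same import assumption as above):

```python
import pandas as pd
from openavmkit.utilities.data import clean_column_names

df = pd.DataFrame(columns=["Lot Size (sqft)", "zoning=R-1", "bldg/quality"])
df = clean_column_names(df)

# Forbidden characters become underscores, runs of underscores collapse, and
# one-hot style names ("base=value") get a double-underscore separator:
print(list(df.columns))  # ['Lot_Size_sqft', 'zoning__R_1', 'bldg_quality']
```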

clean_series

clean_series(series)

Clean the values in a Series by replacing forbidden characters with legal representations.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| series | Series | The series to be cleaned | required |

Returns:

| Type | Description |
| --- | --- |
| Series | The cleaned series |

Source code in openavmkit/utilities/data.py
def clean_series(series: pd.Series):
    """Clean the values in a Series by replacing forbidden characters with legal representations.

    Parameters
    ----------
    series : pd.Series
        The series to be cleaned

    Returns
    -------
    pd.Series
        The cleaned series
    """
    replace_map = {
        "[": "_LBRKT_",
        "]": "_RBRKT_",
        "<NA>": "_NA_",
        "/": "_SLASH_",
        "\\": "_BSLASH_",
        ":": "_COLON_",
        "*": "_STAR_",
        "?": "_QMARK_",
        '"': "_DQUOT_",
        "<": "_LT_",
        ">": "_GT_",
        "|": "_PIPE_",
    }

    for key in replace_map:
        series = series.str.replace(key, replace_map[key], regex=False)

    return series
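
A quick sketch with illustrative string values:

```python
import pandas as pd
from openavmkit.utilities.data import clean_series

s = pd.Series(["R-1/PUD", "C:2", "A*"])
print(clean_series(s).tolist())
# ['R-1_SLASH_PUD', 'C_COLON_2', 'A_STAR_']
```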

combine_dfs

combine_dfs(df1, df2, df2_stomps=False, index='key')

Combine two DataFrames on a given index column.

If df2_stomps is False, NA values in df1 are filled with values from df2. If df2_stomps is True, values in df1 are overwritten by those in df2 for matching keys.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df1 | DataFrame | First DataFrame | required |
| df2 | DataFrame | Second DataFrame | required |
| df2_stomps | bool | Flag indicating if df2 values should overwrite df1 values (default is False). | False |
| index | str | Column name to use as the index for alignment (default is "key"). | 'key' |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | Combined DataFrame |

Source code in openavmkit/utilities/data.py
def combine_dfs(
    df1: pd.DataFrame, df2: pd.DataFrame, df2_stomps=False, index: str = "key"
) -> pd.DataFrame:
    """Combine two DataFrames on a given index column.

    If ``df2_stomps`` is False, NA values in df1 are filled with values from df2. If
    ``df2_stomps`` is True, values in df1 are overwritten by those in df2 for matching keys.

    Parameters
    ----------
    df1 : pd.DataFrame
        First DataFrame
    df2 : pd.DataFrame
        Second DataFrame
    df2_stomps : bool, optional
        Flag indicating if df2 values should overwrite df1 values (default is False).
    index : str, optional
        Column name to use as the index for alignment (default is "key").

    Returns
    -------
    pd.DataFrame
        Combined DataFrame
    """
    df = df1.copy()
    # Save the original index for restoration
    original_index = df.index.copy()

    # Work on a copy so we don't modify df2 outside this function.
    df2 = df2.copy()

    # Set the index to the key column for alignment.
    df.index = df[index]
    df2.index = df2[index]

    # Iterate over columns in df2 (skip the key column).
    for col in df2.columns:
        if col == index:
            continue
        if col in df.columns:
            # Find the common keys to avoid KeyErrors if df2 has extra keys.
            common_idx = df.index.intersection(df2.index)
            if df2_stomps:
                # Overwrite all values in df for common keys.
                df.loc[common_idx, col] = df2.loc[common_idx, col]
            else:
                # For common keys, fill only NA values.
                na_mask = pd.isna(df.loc[common_idx, col])
                # Only assign where df2 has a value and df is NA.
                df.loc[common_idx[na_mask], col] = df2.loc[common_idx[na_mask], col]
        else:
            # Add the new column, aligning by index.
            # (Rows in df without a corresponding key in df2 will get NaN.)
            df[col] = df2[col]

    # Restore the original index.
    df.index = original_index
    return df
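
A sketch contrasting the fill and stomp behaviors, with illustrative keys and values:

```python
import pandas as pd
from openavmkit.utilities.data import combine_dfs

df1 = pd.DataFrame({"key": ["A", "B", "C"], "land_value": [100, None, 300]})
df2 = pd.DataFrame({"key": ["B", "C"], "land_value": [250, 999], "zoning": ["R1", "C1"]})

# Default: df2 only fills gaps in df1 (B gets 250, C keeps 300); the new
# zoning column from df2 is added, aligned on "key".
filled = combine_dfs(df1, df2)

# df2_stomps=True: df2 overwrites df1 wherever the keys match (B=250, C=999).
stomped = combine_dfs(df1, df2, df2_stomps=True)

print(filled)
print(stomped)
```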

count_values_in_common

count_values_in_common(a, b, a_field, b_field=None)

Count all the unique values that two columns of two dataframes have in common

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| a | DataFrame | The first DataFrame | required |
| b | DataFrame | The second DataFrame | required |
| a_field | str | The column from the first DataFrame | required |
| b_field | str | The column from the second DataFrame | None |

Returns:

| Type | Description |
| --- | --- |
| Tuple[int, int] | a in b: the number of a's unique values that are also found in b; b in a: the number of b's unique values that are also found in a |
Source code in openavmkit/utilities/data.py
def count_values_in_common(
    a: pd.DataFrame, b: pd.DataFrame, a_field: str, b_field: str = None
):
    """Count all the unique values that two columns of two dataframes have in common

    Parameters
    ----------
    a : pd.DataFrame
        The first DataFrame
    b : pd.DataFrame
        The second DataFrame
    a_field : str
        The column from the first DataFrame
    b_field : str, optional
        The column from the second DataFrame

    Returns
    -------
    Tuple[int, int]

        - a in b: The number of a's unique values that are also found in b
        - b in a: The number of b's unique values that are also found in a
    """
    if b_field is None:
        b_field = a_field
    a_values = set(a[a_field].dropna().unique())
    b_values = set(b[b_field].dropna().unique())
    a_in_b = a_values.intersection(b_values)
    b_in_a = b_values.intersection(a_values)
    return len(a_in_b), len(b_in_a)
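
A small sketch with illustrative identifiers:

```python
import pandas as pd
from openavmkit.utilities.data import count_values_in_common

a = pd.DataFrame({"parcel_id": ["001", "002", "003"]})
b = pd.DataFrame({"pid": ["002", "003", "004", "004"]})

a_in_b, b_in_a = count_values_in_common(a, b, "parcel_id", "pid")
print(a_in_b, b_in_a)  # 2 2 -- the unique values '002' and '003' appear on both sides
```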

df_to_markdown

df_to_markdown(df)

Convert a DataFrame to a markdown-formatted string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | DataFrame | Input DataFrame | required |

Returns:

| Type | Description |
| --- | --- |
| str | Markdown representation of the DataFrame |

Source code in openavmkit/utilities/data.py
def df_to_markdown(df: pd.DataFrame):
    """Convert a DataFrame to a markdown-formatted string.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame

    Returns
    -------
    str
        Markdown representation of the DataFrame
    """
    header = "| " + " | ".join(df.columns) + " |"
    separator = "| " + " | ".join(["---"] * len(df.columns)) + " |"
    rows = "\n".join("| " + " | ".join(row) + " |" for row in df.astype(str).values)
    return f"{header}\n{separator}\n{rows}"

div_df_z_safe

div_df_z_safe(df, numerator, denominator)

Perform a divide-by-zero-safe division of two columns in a DataFrame, replacing division by zero with None.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | DataFrame | Input DataFrame | required |
| numerator | str | Name of the column to use as the numerator | required |
| denominator | str | Name of the column to use as the denominator/divisor | required |

Returns:

| Type | Description |
| --- | --- |
| Series | The result of the division with divide-by-zero cases replaced by None |

Source code in openavmkit/utilities/data.py
def div_df_z_safe(df: pd.DataFrame, numerator: str, denominator: str):
    """Perform a divide-by-zero-safe division of two columns in a DataFrame, replacing
    division by zero with None.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame
    numerator : str
        Name of the column to use as the numerator
    denominator : str
        Name of the column to use as the denominator/divisor

    Returns
    -------
    pd.Series
        The result of the division with divide-by-zero cases replaced by ``None``
    """
    # Get the index of all rows where the denominator is zero.
    idx_denominator_zero = df[denominator].eq(0)

    # Get the numerator and denominator for rows where the denominator is not zero.
    series_numerator = df.loc[~idx_denominator_zero, numerator]
    series_denominator = df.loc[~idx_denominator_zero, denominator]

    # Make a copy of the denominator.
    result = df[denominator].copy()

    # Replace values where denominator is zero with None.
    result[idx_denominator_zero] = None

    # Replace other values with the result of the division.

    result = result.astype("Float64")  # ensure it can accept the result

    result[~idx_denominator_zero] = series_numerator / series_denominator
    return result
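
A short sketch of the zero-denominator handling, with illustrative columns:

```python
import pandas as pd
from openavmkit.utilities.data import div_df_z_safe

df = pd.DataFrame({"value": [100.0, 250.0, 80.0], "sqft": [50.0, 0.0, 40.0]})
ratio = div_df_z_safe(df, "value", "sqft")
print(ratio.tolist())  # [2.0, <NA>, 2.0] -- the zero-denominator row is missing, not infinite
```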

div_series_z_safe

div_series_z_safe(numerator, denominator)

Perform a divide-by-zero-safe division of two series or arrays, replacing division by zero with None.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| numerator | Series or ndarray | The series/array that serves as the numerator | required |
| denominator | Series or ndarray | The series/array that serves as the denominator/divisor | required |

Returns:

| Type | Description |
| --- | --- |
| Series or ndarray | The result of the division with divide-by-zero cases replaced by None |

Source code in openavmkit/utilities/data.py
def div_series_z_safe(
    numerator: pd.Series | np.ndarray, denominator: pd.Series | np.ndarray
):
    """Perform a divide-by-zero-safe division of two series or arrays, replacing division
    by zero with None.

    Parameters
    ----------
    numerator : pd.Series | np.ndarray
        The series/array that serves as the numerator
    denominator : pd.Series | np.ndarray
        The series/array that serves as the denominator/divisor

    Returns
    -------
    pd.Series | np.ndarray
        The result of the division with divide-by-zero cases replaced by ``None``
    """
    # fast path for ndarray
    if isinstance(numerator, np.ndarray) or isinstance(denominator, np.ndarray):
        num = np.asarray(numerator, dtype=np.float64, order='K')
        den = np.asarray(denominator, dtype=np.float64, order='K')

        # pre‑allocate the output filled with NaN
        out = np.full_like(num, np.nan, dtype=np.float64)

        # element‑wise division only where the denominator is non‑zero
        # np.divide writes directly into `out`
        np.divide(num, den, out=out, where=den != 0)

        return out
    # ---------- pandas path -------------------------------------------------
    num = pd.Series(numerator, copy=False)
    den = pd.Series(denominator, copy=False)

    idx_zero = den == 0
    result = num.div(den).astype("Float64")
    result[idx_zero] = pd.NA
    return result
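
A short sketch covering both the NumPy and pandas paths:

```python
import numpy as np
import pandas as pd
from openavmkit.utilities.data import div_series_z_safe

# NumPy path: zero denominators come back as NaN
print(div_series_z_safe(np.array([10.0, 5.0]), np.array([2.0, 0.0])))  # [ 5. nan]

# pandas path: zero denominators come back as <NA> in a nullable Float64 series
print(div_series_z_safe(pd.Series([10.0, 5.0]), pd.Series([2.0, 0.0])))
```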

do_per_model_group

do_per_model_group(df_in, settings, func, params, key='key', verbose=False, instructions=None, skip=None)

Apply a function to each subset of the DataFrame grouped by model_group, updating rows based on matching indices.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_in | DataFrame | Input DataFrame | required |
| settings | dict | Settings dictionary | required |
| func | callable | Function to apply to each subset | required |
| params | dict | Additional parameters for the function | required |
| key | str | Column name to use as the index for alignment (default is "key") | 'key' |
| verbose | bool | Whether to print verbose output. Default is False. | False |
| instructions | Any | Special instructions for the function | None |
| skip | list | List of model group names to skip | None |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | Modified DataFrame with updates from the function. |

Source code in openavmkit/utilities/data.py
def do_per_model_group(
    df_in: pd.DataFrame,
    settings: dict,
    func: callable,
    params: dict,
    key: str = "key",
    verbose: bool = False,
    instructions=None,
    skip:list|None=None
) -> pd.DataFrame:
    """Apply a function to each subset of the DataFrame grouped by ``model_group``, updating
    rows based on matching indices.

    Parameters
    ----------
    df_in : pd.DataFrame
        Input DataFrame
    settings : dict
        Settings dictionary
    func : callable
        Function to apply to each subset
    params : dict
        Additional parameters for the function
    key : str, optional
        Column name to use as the index for alignment (default is "key")
    verbose : bool, optional
        Whether to print verbose output. Default is False.
    instructions : Any, optional
        Special instructions for the function
    skip : list, optional
        List of model group names to skip

    Returns
    -------
    pd.DataFrame
        Modified DataFrame with updates from the function.
    """
    df = df_in.copy()

    if instructions is None:
        instructions = {}

    model_groups = get_model_group_ids(settings, df_in)
    verbose = params.get("verbose", verbose)

    for model_group in model_groups:
        if pd.isna(model_group):
            continue
        if skip is not None and model_group in skip:
            if verbose:
                print(f"Skipping model group: {model_group}")
            continue

        if verbose:
            print(f"Processing model group: {model_group}")

        # Copy params locally to avoid side effects.
        params_local = params.copy()
        params_local["model_group"] = model_group

        # Filter the subset using .loc to avoid SettingWithCopyWarning
        mask = df["model_group"].eq(model_group)
        df_sub = df.loc[mask].copy()

        # Apply the function.
        df_sub_updated = func(df_sub, **params_local)

        if df_sub_updated is not None:
            # Ensure consistent data types between df and the updated subset.
            just_stomp_columns = instructions.get("just_stomp_columns", [])
            if len(just_stomp_columns) > 0:
                for col in just_stomp_columns:
                    if col in df_sub_updated.columns:
                        df.loc[mask, col] = df_sub_updated[col]
            else:
                for col in df_sub_updated.columns:
                    if col == key:
                        continue
                    df = combine_dfs(
                        df, df_sub_updated[[key, col]], df2_stomps=True, index=key
                    )

    return df

ensure_categories

ensure_categories(df, df_other, field)

Harmonize categorical levels between two DataFrames for a specified column.

If both df[field] and df_other[field] are of pandas Categorical dtype, this routine computes the union of their categories (preserving the order from df[field] first, then any additional categories from df_other[field]) and sets both Series to use the combined category list. If either column is not categorical, the DataFrames are returned unchanged.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | DataFrame | Primary DataFrame containing the categorical column to standardize. | required |
| df_other | DataFrame | Secondary DataFrame whose categorical column will be aligned to the same category set. | required |
| field | str | Name of the column in both DataFrames to synchronize categories on. | required |

Returns:

| Type | Description |
| --- | --- |
| tuple of pandas.DataFrame | A 2-tuple (df_out, df_other_out) where both DataFrames have their field column set to the same Categorical categories. If the column dtype in either DataFrame is not Categorical, both DataFrames are returned without modification. |

Source code in openavmkit/utilities/data.py
def ensure_categories(
    df: pd.DataFrame, df_other: pd.DataFrame, field: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Harmonize categorical levels between two DataFrames for a specified column.

    If both `df[field]` and `df_other[field]` are of pandas Categorical dtype,
    this routine computes the union of their categories (preserving the order
    from `df[field]` first, then any additional categories from
    `df_other[field]`) and sets both Series to use the combined category list.
    If either column is not categorical, the DataFrames are returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Primary DataFrame containing the categorical column to standardize.
    df_other : pandas.DataFrame
        Secondary DataFrame whose categorical column will be aligned to the
        same category set.
    field : str
        Name of the column in both DataFrames to synchronize categories on.

    Returns
    -------
    tuple of pandas.DataFrame
        A 2-tuple `(df_out, df_other_out)` where both DataFrames have their
        `field` column set to the same Categorical categories.  If the column
        dtype in either DataFrame is not Categorical, both DataFrames are
        returned without modification.
    """
    if isinstance(df[field].dtype, pd.CategoricalDtype) and isinstance(
        df_other[field].dtype, pd.CategoricalDtype
    ):

        # union keeps order of appearance in the first operands
        cats = df[field].cat.categories.union(df_other[field].cat.categories)

        # give *both* Series the identical category list
        df[field] = df[field].cat.set_categories(cats)
        df_other[field] = df_other[field].cat.set_categories(cats)

    return df, df_other
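
A small sketch with an illustrative zoning column:

```python
import pandas as pd
from openavmkit.utilities.data import ensure_categories

df_a = pd.DataFrame({"zoning": pd.Categorical(["R1", "R2"])})
df_b = pd.DataFrame({"zoning": pd.Categorical(["R2", "C1"])})

df_a, df_b = ensure_categories(df_a, df_b, "zoning")

# Both columns now share the same (union) category list, so later concatenation
# or comparison will not silently fall back to object dtype.
print(df_a["zoning"].cat.categories.equals(df_b["zoning"].cat.categories))  # True
```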

load_model_results

load_model_results(model_group, model_name, subset='universe', model_type='main')

Load model prediction results for a specified subset from disk, if available.

The function searches for prediction files under out/models/{model_group}/{model_type}/{model_name} in two formats:

  1. Parquet: Looks for either pred_{subset}.parquet or pred_{model_name}_{subset}.parquet. If found, reads the file, renames column key_x to key (if present), and returns a DataFrame with columns ['key', 'prediction'].

  2. Pickle: If no parquet is found, checks for pred_{subset}.pkl. Loads the pickled object (expected to have attributes df_universe, df_sales, and df_test), selects the DataFrame matching subset, and returns its ['key', 'prediction'] columns.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model_group | str | Top-level folder grouping for the model outputs (e.g., experiment or category name). | required |
| model_name | str | Subfolder name identifying the specific model within the group. | required |
| subset | str | Which dataset predictions to load. Must be one of 'universe' (all parcels), 'sales' (parcels with sales), or 'test' (test split). | "universe" |
| model_type | str | Subdirectory under model_group for model variations (e.g., "main", "vacant", "hedonic"). | "main" |

Returns:

| Type | Description |
| --- | --- |
| DataFrame or None | A DataFrame with exactly two columns, 'key' and 'prediction', for the requested subset if a prediction file was successfully found and read; None if no matching prediction file exists on disk. |
Source code in openavmkit/utilities/data.py
def load_model_results(
    model_group: str,
    model_name: str,
    subset: str = "universe",
    model_type: str = "main",
):
    """
    Load model prediction results for a specified subset from disk, if available.

    The function searches for prediction files under
    ``out/models/{model_group}/{model_type}/{model_name}`` in two formats:

    1. **Parquet**: Looks for either
       ``pred_{subset}.parquet`` or
       ``pred_{model_name}_{subset}.parquet``.  If found, reads the file,
       renames column ``key_x`` to ``key`` (if present), and returns a
       DataFrame with columns ``['key', 'prediction']``.

    2. **Pickle**: If no parquet is found, checks for
       ``pred_{subset}.pkl``.  Loads the pickled object (expected to have
       attributes ``df_universe``, ``df_sales``, and ``df_test``), selects
       the DataFrame matching ``subset``, and returns its ``['key',
       'prediction']`` columns.

    Parameters
    ----------
    model_group : str
        Top-level folder grouping for the model outputs (e.g., experiment
        or category name).
    model_name : str
        Subfolder name identifying the specific model within the group.
    subset : str, default "universe"
        Which dataset predictions to load.  Must be one of:
        - ``'universe'``: all parcels
        - ``'sales'``: parcels with sales
        - ``'test'``: test split
    model_type : str, default "main"
        Subdirectory under ``model_group`` for model variations
        (e.g., "main", "vacant", "hedonic").

    Returns
    -------
    pandas.DataFrame or None
        - A DataFrame with exactly two columns: ``'key'`` and
          ``'prediction'`` for the requested subset, if a prediction file
          was successfully found and read.
        - ``None`` if no matching prediction file exists on disk.
    """
    outpath = f"out/models/{model_group}/{model_type}"

    filepath = f"{outpath}/{model_name}"
    if os.path.exists(filepath):
        fpred = f"{filepath}/pred_{subset}.parquet"
        if not os.path.exists(fpred):
            fpred = f"{filepath}/pred_{model_name}_{subset}.parquet"

        if os.path.exists(fpred):
            df = pd.read_parquet(fpred)
            if "key_x" in df:
                # If the DataFrame has a 'key_x' column, rename it to 'key'
                df.rename(columns={"key_x": "key"}, inplace=True)
            df = df[["key", "prediction"]].copy()
            return df

    fpred_results = f"{filepath}/pred_{subset}.pkl"
    if os.path.exists(fpred_results):
        if model_type != "main":
            with open(fpred_results, "rb") as file:
                results = pickle.load(file)
                if subset == "universe":
                    df = results.df_universe[["key", "prediction"]].copy()
                elif subset == "sales":
                    df = results.df_sales[["key", "prediction"]].copy()
                elif subset == "test":
                    df = results.df_test[["key", "prediction"]].copy()
                return df

    return None
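
A usage sketch; the model group and model name below are placeholders rather than names taken from this module, and the call simply returns None when no matching prediction file exists under out/models:

```python
from openavmkit.utilities.data import load_model_results

# Placeholder group/model names -- substitute the ones your modeling run produced.
df_pred = load_model_results("residential_sf", "lightgbm", subset="test")

if df_pred is None:
    print("No saved predictions found for this model and subset.")
else:
    print(df_pred.head())  # columns: 'key' and 'prediction'
```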

merge_and_stomp_dfs

merge_and_stomp_dfs(df1, df2, df2_stomps=False, on='key', how='left')

Merge two DataFrames and resolve overlapping columns by 'stomping'.

Performs a pandas merge of df1 and df2 on key(s) on, using suffixes '_1' and '_2' for overlapping column names. After merging, for each common column (excluding join keys) the function selects values from df2 wherever non-null if df2_stomps=True, otherwise prefers df1's non-null values. Intermediate suffixed columns are dropped before returning the final DataFrame.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df1 | DataFrame | Base DataFrame whose values are used when df2_stomps=False or when df2 has nulls in overlapping columns. | required |
| df2 | DataFrame | Secondary DataFrame whose values may overwrite those in df1 when df2_stomps=True and non-null. | required |
| df2_stomps | bool | If True, prefer non-null values from df2 over df1 in overlapping columns; if False, prefer non-null values from df1. | False |
| on | str or list of str | Column name or list of column names to join on. | 'key' |
| how | str | Type of join to perform: 'left', 'right', 'inner', or 'outer'. | 'left' |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | The merged DataFrame with overlapping columns resolved according to the df2_stomps policy. All original columns and merged non-overlapping columns are retained; intermediate '_1' and '_2' suffix columns are removed. |

Source code in openavmkit/utilities/data.py
def merge_and_stomp_dfs(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    df2_stomps=False,
    on: str | list = "key",
    how: str = "left",
) -> pd.DataFrame:
    """
    Merge two DataFrames and resolve overlapping columns by 'stomping'.

    Performs a pandas merge of `df1` and `df2` on key(s) `on`, using suffixes
    '_1' and '_2' for overlapping column names.  After merging, for each
    common column (excluding join keys) the function selects values from
    `df2` wherever non-null if `df2_stomps=True`, otherwise prefers `df1`'s
    non-null values.  Intermediate suffixed columns are dropped before
    returning the final DataFrame.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Base DataFrame whose values are used when `df2_stomps=False` or when
        `df2` has nulls in overlapping columns.
    df2 : pandas.DataFrame
        Secondary DataFrame whose values may overwrite those in `df1`
        when `df2_stomps=True` and non-null.
    df2_stomps : bool, default False
        If True, prefer non-null values from `df2` over `df1` in overlapping
        columns; if False, prefer non-null values from `df1`.
    on : str or list of str, default 'key'
        Column name or list of column names to join on.
    how : str, default 'left'
        Type of join to perform: 'left', 'right', 'inner', or 'outer'.

    Returns
    -------
    pandas.DataFrame
        The merged DataFrame with overlapping columns resolved according to the
        `df2_stomps` policy.  All original columns and merged non-overlapping
        columns are retained; intermediate '_1' and '_2' suffix columns are
        removed.
    """
    common_columns = [col for col in df1.columns if col in df2.columns]
    df_merge = pd.merge(df1, df2, on=on, how=how, suffixes=("_1", "_2"))
    suffixed_columns = [col + "_1" for col in common_columns] + [
        col + "_2" for col in common_columns
    ]
    suffixed_columns = [col for col in suffixed_columns if col in df_merge.columns]

    for col in common_columns:
        if col == on or (isinstance(on, list) and col in on):
            continue
        if df2_stomps:
            # prefer df2's column value everywhere df2 has a non-null value
            # Filter out empty entries before combining
            df2_col = df_merge[col + "_2"].dropna()
            df1_col = df_merge[col + "_1"].dropna()
            if df2_col.size > 0 and df1_col.size > 0:
                df_merge[col] = df2_col.combine_first(df1_col)
            elif df2_col.size > 0:
                df_merge[col] = df2_col
            else:
                df_merge[col] = df1_col
        else:
            # prefer df1's column value everywhere df1 has a non-null value
            s1 = df_merge[f"{col}_1"]
            s2 = df_merge[f"{col}_2"]
            df_merge[col] = _left_wins(s1, s2)

    df_merge.drop(columns=suffixed_columns, inplace=True)
    return df_merge
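
A sketch contrasting the two stomping policies, with illustrative values:

```python
import pandas as pd
from openavmkit.utilities.data import merge_and_stomp_dfs

df1 = pd.DataFrame({"key": ["A", "B"], "land_value": [100, None]})
df2 = pd.DataFrame({"key": ["A", "B"], "land_value": [150, 250]})

# Default policy: df1 wins wherever it has a value (A stays 100, B takes 250)
print(merge_and_stomp_dfs(df1, df2)["land_value"].tolist())

# df2_stomps=True: df2 wins wherever it has a value (A becomes 150, B becomes 250)
print(merge_and_stomp_dfs(df1, df2, df2_stomps=True)["land_value"].tolist())
```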

rename_dict

rename_dict(dict, renames)

Rename the keys of a dictionary according to a given rename map.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dict | Dictionary | Original dictionary. | required |
| renames | Dictionary | Dictionary mapping old keys to new keys. | required |

Returns:

| Type | Description |
| --- | --- |
| dict | New dictionary with keys renamed |
Source code in openavmkit/utilities/data.py
def rename_dict(dict, renames):
    """Rename the keys of a dictionary according to a given rename map.

    Parameters
    ----------
    dict : Dictionary
        Original dictionary.

    renames : Dictionary
        Dictionary mapping old keys to new keys.

    Returns
    -------
    New dictionary with keys renamed
    """
    new_dict = {}
    for key in dict:
        new_key = renames.get(key, key)
        new_dict[new_key] = dict[key]
    return new_dict
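
A quick sketch; the replacement field names are illustrative:

```python
from openavmkit.utilities.data import rename_dict

raw = {"sqft": 1500, "yr_blt": 1987, "key": "A"}
renamed = rename_dict(raw, {"sqft": "bldg_area_finished_sqft", "yr_blt": "bldg_year_built"})
print(renamed)  # {'bldg_area_finished_sqft': 1500, 'bldg_year_built': 1987, 'key': 'A'}
```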