openavmkit.cleaning

clean_valid_sales

clean_valid_sales(sup, settings)

Clean and validate sales data in the SalesUniversePair.

This function processes the sales data to ensure that only valid sales are retained. It also ensures that the sales data is consistent with the universe data, particularly regarding the vacancy status of parcels. Invalid sales are scrubbed of their metadata, and valid sales are properly classified for ratio studies.

Parameters:

    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data. (required)
    settings : dict
        The settings dictionary containing configuration for the cleaning process. (required)

Returns:

    SalesUniversePair
        The updated SalesUniversePair with cleaned and validated sales data.
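
Example (illustrative). A minimal usage sketch, assuming a settings layout matching what the source below reads; only modeling.metadata.use_sales_from is shown, and clean_valid_sales also derives the valuation date from settings via get_valuation_date:

    from openavmkit.cleaning import clean_valid_sales

    # Hypothetical settings fragment. A bare int for "use_sales_from" applies
    # one cutoff year to both improved and vacant sales; a dict sets each
    # cutoff separately.
    settings = {
        "modeling": {
            "metadata": {
                "use_sales_from": {"improved": 2019, "vacant": 2015},
            }
        }
    }

    sup = clean_valid_sales(sup, settings)  # sup: an existing SalesUniversePair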

Source code in openavmkit/cleaning.py
def clean_valid_sales(sup: SalesUniversePair, settings: dict) -> SalesUniversePair:
    """Clean and validate sales data in the SalesUniversePair.

    This function processes the sales data to ensure that only valid sales are retained.
    It also ensures that the sales data is consistent with the universe data, particularly regarding
    the vacancy status of parcels. Invalid sales are scrubbed of their metadata, and valid sales are
    properly classified for ratio studies.

    Parameters
    ----------
    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data.
    settings : dict
        The settings dictionary containing configuration for the cleaning process.

    Returns
    -------
    SalesUniversePair
        The updated SalesUniversePair with cleaned and validated sales data.
    """
    # load metadata
    val_date = get_valuation_date(settings)
    val_year = val_date.year
    metadata = settings.get("modeling", {}).get("metadata", {})
    use_sales_from = metadata.get("use_sales_from", {})

    if isinstance(use_sales_from, int):
        use_sales_from_impr = use_sales_from
        use_sales_from_vacant = use_sales_from
    else:
        use_sales_from_impr = use_sales_from.get("improved", val_year - 5)
        use_sales_from_vacant = use_sales_from.get("vacant", val_year - 5)

    df_sales = sup["sales"].copy()
    df_univ = sup["universe"]

    # temporarily merge in universe's vacancy status (how the parcel is now)
    df_univ_vacant = (
        df_univ[["key", "is_vacant"]]
        .copy()
        .rename(columns={"is_vacant": "univ_is_vacant"})
    )

    # check df_univ for duplicate keys:
    if len(df_univ["key"].unique()) != len(df_univ):
        print("WARNING: df_univ has duplicate keys, this will cause problems")
        # print how many:
        dupe_key_count = len(df_univ) - len(df_univ["key"].unique())
        print(f"--> {dupe_key_count} rows with duplicate keys found")

    print(f"Before univ merge len = {len(df_sales)}")

    df_sales = df_sales.merge(df_univ_vacant, on="key", how="left")

    print(f"After univ merge len = {len(df_sales)}")

    # mark which sales are to be used: only those that are valid and within the
    # specified time frame, with separate cutoffs for improved and vacant sales
    df_sales.loc[
        df_sales["sale_year"].lt(use_sales_from_impr)
        & df_sales["vacant_sale"].eq(False),
        "valid_sale",
    ] = False
    df_sales.loc[
        df_sales["sale_year"].lt(use_sales_from_vacant)
        & df_sales["vacant_sale"].eq(True),
        "valid_sale",
    ] = False

    # sale prices of 0 and negative and null are invalid
    df_sales.loc[
        df_sales["sale_price"].isna() | df_sales["sale_price"].le(0), "valid_sale"
    ] = False

    # scrub sales info from invalid sales
    idx_invalid = df_sales["valid_sale"].eq(False)
    fields_to_scrub = [
        "sale_date",
        "sale_price",
        "sale_year",
        "sale_month",
        "sale_day",
        "sale_quarter",
        "sale_year_quarter",
        "sale_year_month",
        "sale_age_days",
        "sale_price_per_land_sqft",
        "sale_price_per_impr_sqft",
        "sale_price_time_adj",
        "sale_price_time_adj_per_land_sqft",
        "sale_price_time_adj_per_impr_sqft",
    ]

    for field in fields_to_scrub:
        if field in df_sales:
            df_sales.loc[idx_invalid, field] = None

    # drop all invalid sales:
    df_sales = df_sales[df_sales["valid_sale"].eq(True)].copy()

    # initialize these -- we want to further determine which valid sales are valid for ratio studies
    df_sales["valid_for_ratio_study"] = False
    df_sales["valid_for_land_ratio_study"] = False

    # NORMAL RATIO STUDIES:
    # If it's a valid sale, and its vacancy status matches its status at time of sale, it's valid for a ratio study
    # This is because how it looked at time of sale matches how it looks now, so the prediction is comparable to the sale
    # If the vacancy status has changed since it sold, we can't meaningfully compare sale price to current valuation
    df_sales.loc[
        df_sales["valid_sale"] & df_sales["vacant_sale"].eq(df_sales["univ_is_vacant"]),
        "valid_for_ratio_study",
    ] = True

    # LAND RATIO STUDIES:
    # If it's a valid sale, and it was vacant at time of sale, it's valid for a LAND ratio study regardless of whether it
    # is valid for a normal ratio study. That's because we will come up with a land value prediction no matter what, and
    # we can always compare that to what it sold for, as long as it was vacant at time of sale
    df_sales.loc[
        df_sales["valid_sale"] & df_sales["vacant_sale"].eq(True),
        "valid_for_land_ratio_study",
    ] = True

    print(f"Using {len(df_sales[df_sales['valid_sale'].eq(True)])} sales...")
    print(f"--> {len(df_sales[df_sales['vacant_sale'].eq(True)])} vacant sales")
    print(f"--> {len(df_sales[df_sales['vacant_sale'].eq(False)])} improved sales")
    print(
        f"--> {len(df_sales[df_sales['valid_for_ratio_study'].eq(True)])} valid for ratio study"
    )
    print(
        f"--> {len(df_sales[df_sales['valid_for_land_ratio_study'].eq(True)])} valid for land ratio study"
    )

    # We need to ensure that the flag "is_vacant" is valid to train on
    # So in sales it needs to reflect the sale's vacant status
    # When hydrating, this will stomp the universe's vacant status, which is exactly what we want in a training set
    # Meanwhile, during prediction, it will infer based on the universe's vacant status
    df_sales["is_vacant"] = df_sales["vacant_sale"]

    df_sales = df_sales.drop(columns=["univ_is_vacant"])

    # enforce some booleans:
    bool_fields = [
        "valid_sale",
        "vacant_sale",
        "valid_for_ratio_study",
        "valid_for_land_ratio_study",
    ]
    for b in bool_fields:
        if b in df_sales:
            dtype = df_sales[b].dtype
            if dtype != bool:
                if _is_series_all_bools(df_sales[b]):
                    df_sales[b] = df_sales[b].astype(bool)
                else:
                    raise ValueError(
                        f"Field '{b}' contains non-boolean values that cannot be coerced to boolean. Unique values = {df_sales[b].unique()}"
                    )

    sup.update_sales(df_sales, allow_remove_rows=True)

    return sup
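
The ratio-study classification above can be summarized with a toy sketch (hypothetical values, mirroring the logic in the source):

    import pandas as pd

    sales = pd.DataFrame({
        "valid_sale":     [True, True, True],
        "vacant_sale":    [True, True, False],   # status at time of sale
        "univ_is_vacant": [True, False, False],  # status in the universe today
    })

    # NORMAL ratio study: vacancy status is unchanged since the sale
    sales["valid_for_ratio_study"] = (
        sales["valid_sale"] & sales["vacant_sale"].eq(sales["univ_is_vacant"])
    )
    # LAND ratio study: vacant at time of sale, regardless of current status
    sales["valid_for_land_ratio_study"] = sales["valid_sale"] & sales["vacant_sale"]

    # Row 0: vacant then, vacant now   -> valid for both studies
    # Row 1: vacant then, improved now -> land ratio study only
    # Row 2: improved then and now     -> normal ratio study only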

fill_unknown_values_sup

fill_unknown_values_sup(sup, settings)

Fill unknown values with default values as specified in settings.

Parameters:

    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data. (required)
    settings : dict
        The settings dictionary containing configuration for filling unknown values. (required)

Returns:

    SalesUniversePair
        The updated SalesUniversePair with filled unknown values.
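
Example (illustrative). The overlay behavior described in the source comments below can be sketched with toy data (hypothetical column names, not the library's internals): a blank characteristic in sales falls through to the universe's value on hydration, which is why this function does not default-fill those fields:

    import pandas as pd

    univ = pd.DataFrame({"key": ["A", "B"], "quality": ["good", "fair"]})
    sales = pd.DataFrame({"key": ["A", "B"], "quality": [pd.NA, "poor"]})

    # A non-blank sale characteristic overrides the universe; a blank
    # falls through to the universe's value.
    hydrated = sales.merge(univ, on="key", suffixes=("_sale", "_univ"))
    hydrated["quality"] = hydrated["quality_sale"].fillna(hydrated["quality_univ"])
    # key A -> "good" (from universe), key B -> "poor" (sale overlay)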

Source code in openavmkit/cleaning.py
def fill_unknown_values_sup(
    sup: SalesUniversePair, settings: dict
) -> SalesUniversePair:
    """Fill unknown values with default values as specified in settings.

    Parameters
    ----------
    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data.
    settings : dict
        The settings dictionary containing configuration for filling unknown values.

    Returns
    -------
    SalesUniversePair
        The updated SalesUniversePair with filled unknown values.
    """
    df_sales = sup["sales"].copy()
    df_univ = sup["universe"].copy()

    # Fill ALL unknown values for the universe
    df_univ = _fill_unknown_values_per_model_group(df_univ, settings)

    # For sales, fill ONLY the unknown values that pertain to sales metadata
    # df_sales can contain characteristics, but we want to preserve the blanks in those fields because they function
    # as overlays on top of the universe data
    dd = get_data_dictionary(settings)
    sale_fields = get_grouped_fields_from_data_dictionary(dd, "sale")
    sale_fields = [field for field in sale_fields if field in df_sales]

    df_sales_subset = df_sales[sale_fields].copy()
    df_sales_subset = _fill_unknown_values(df_sales_subset, settings)
    for col in df_sales_subset:
        df_sales[col] = df_sales_subset[col]

    sup.set("sales", df_sales)
    sup.set("universe", df_univ)

    return sup

validate_arms_length_sales

validate_arms_length_sales(sup, settings, verbose=False)

Validate arms-length sales based on configurable filter conditions.

Parameters:

    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data. (required)
    settings : dict
        The settings dictionary containing configuration for arms-length validation. (required)
    verbose : bool
        If True, prints detailed information about the validation process. (default: False)

Returns:

    SalesUniversePair
        The updated SalesUniversePair with arms-length validation applied to sales data.
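
Example (illustrative). A hedged settings fragment whose nesting mirrors the keys the function reads (data.process.arms_length_validation); the condition schema itself is defined by resolve_filter, and the entries shown here are placeholders:

    # Hypothetical settings fragment -- condition syntax is illustrative only.
    settings = {
        "data": {
            "process": {
                "arms_length_validation": {
                    "enabled": True,
                    "filter": {
                        "enabled": True,
                        "conditions": [
                            # placeholder; see resolve_filter for the real schema
                            {"field": "deed_type", "op": "isin",
                             "values": ["quitclaim", "sheriff_deed"]},
                        ],
                    },
                }
            }
        }
    }

    sup = validate_arms_length_sales(sup, settings, verbose=True)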

Source code in openavmkit/cleaning.py
def validate_arms_length_sales(
    sup: SalesUniversePair, settings: dict, verbose: bool = False
) -> SalesUniversePair:
    """Validate arms-length sales using based on configurable filter conditions.

    Parameters
    ----------
    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data.
    settings : dict
        The settings dictionary containing configuration for arms-length validation.
    verbose : bool, optional
        If True, prints detailed information about the validation process. Default is False.

    Returns
    -------
    SalesUniversePair
        The updated SalesUniversePair with arms-length validation applied to sales data.
    """
    s_data = settings.get("data", {})
    s_process = s_data.get("process", {})
    s_validation = s_process.get("arms_length_validation", {})

    if not s_validation.get("enabled", False):
        if verbose:
            print("Arms length validation disabled, skipping...")
        return sup

    # Get sales data
    df_sales = sup["sales"].copy()
    total_sales = len(df_sales)
    excluded_sales = []
    total_excluded = 0

    # Identify sales by filter
    filter_settings = s_validation.get("filter", {})
    if filter_settings.get("enabled", False):
        if verbose:
            print("\nApplying filter method...")

        # Get filter conditions from settings
        filter_conditions = filter_settings.get("conditions", [])
        if not filter_conditions:
            raise ValueError("No filter conditions defined in settings")

        # Resolve filter using standard filter resolution
        filter_mask = resolve_filter(df_sales, filter_conditions)

        # Get keys of filtered sales
        filtered_keys = df_sales[filter_mask]["key_sale"].tolist()
        if filtered_keys:
            excluded_info = {
                "method": "filter",
                "key_sales": filtered_keys,
                "total_sales": total_sales,
                "excluded": len(filtered_keys),
                "conditions": filter_conditions,
            }
            excluded_sales.append(excluded_info)
            total_excluded += len(filtered_keys)

            # Mark these sales as invalid
            df_sales.loc[df_sales["key_sale"].isin(filtered_keys), "valid_sale"] = False

            if verbose:
                print(f"--> Found {len(filtered_keys)} sales excluded by filter method")

    if verbose:
        print(f"\nOverall summary:")
        print(f"--> Total sales processed: {total_sales}")
        print(
            f"--> Total sales excluded: {total_excluded} ({total_excluded/total_sales*100:.1f}%)"
        )

    # Cache the excluded sales info
    if excluded_sales:
        cache_data = {
            "excluded_sales": excluded_sales,
            "total_sales": total_sales,
            "total_excluded": total_excluded,
            "settings": s_validation,
        }
        write_cache("arms_length_validation", cache_data, cache_data, "dict")

    # Filter out invalid sales and update the SalesUniversePair
    df_sales = df_sales[df_sales["valid_sale"].eq(True)].copy()
    sup.set("sales", df_sales)
    return sup