openavmkit.cleaning

clean_valid_sales

clean_valid_sales(sup, settings)

Clean and validate sales data in the SalesUniversePair.

This function processes the sales data to ensure that only valid sales are retained. It also ensures that the sales data is consistent with the universe data, particularly regarding the vacancy status of parcels. Invalid sales are scrubbed of their metadata, and valid sales are properly classified for ratio studies.

Parameters:

    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data. (required)
    settings : dict
        The settings dictionary containing configuration for the cleaning process. (required)

Returns:

    SalesUniversePair
        The updated SalesUniversePair with cleaned and validated sales data.
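
Example (illustrative). A minimal usage sketch, assuming a settings layout matching what the source below reads; only modeling.metadata.use_sales_from is shown, and clean_valid_sales also derives the valuation date from settings via get_valuation_date:

    from openavmkit.cleaning import clean_valid_sales

    # Hypothetical settings fragment. A bare int for "use_sales_from" applies
    # one cutoff year to both improved and vacant sales; a dict sets each
    # cutoff separately.
    settings = {
        "modeling": {
            "metadata": {
                "use_sales_from": {"improved": 2019, "vacant": 2015},
            }
        }
    }

    sup = clean_valid_sales(sup, settings)  # sup: an existing SalesUniversePair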

Source code in openavmkit/cleaning.py
def clean_valid_sales(sup: SalesUniversePair, settings: dict) -> SalesUniversePair:
    """Clean and validate sales data in the SalesUniversePair.

    This function processes the sales data to ensure that only valid sales are retained.
    It also ensures that the sales data is consistent with the universe data, particularly regarding
    the vacancy status of parcels. Invalid sales are scrubbed of their metadata, and valid sales are
    properly classified for ratio studies.

    Parameters
    ----------
    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data.
    settings : dict
        The settings dictionary containing configuration for the cleaning process.

    Returns
    -------
    SalesUniversePair
        The updated SalesUniversePair with cleaned and validated sales data.
    """
    # load metadata
    val_date = get_valuation_date(settings)
    val_year = val_date.year
    metadata = settings.get("modeling", {}).get("metadata", {})
    use_sales_from = metadata.get("use_sales_from", {})

    if isinstance(use_sales_from, int):
        use_sales_from_impr = use_sales_from
        use_sales_from_vacant = use_sales_from
    else:
        use_sales_from_impr = use_sales_from.get("improved", val_year - 5)
        use_sales_from_vacant = use_sales_from.get("vacant", val_year - 5)

    df_sales = sup["sales"].copy()
    df_univ = sup["universe"]

    # temporarily merge in universe's vacancy status (how the parcel is now)
    df_univ_vacant = (
        df_univ[["key", "is_vacant"]]
        .copy()
        .rename(columns={"is_vacant": "univ_is_vacant"})
    )

    # check df_univ for duplicate keys:
    if len(df_univ["key"].unique()) != len(df_univ):
        print("WARNING: df_univ has duplicate keys, this will cause problems")
        # print how many:
        dupe_key_count = len(df_univ) - len(df_univ["key"].unique())
        print(f"--> {dupe_key_count} rows with duplicate keys found")

    print(f"Before univ merge len = {len(df_sales)}")

    df_sales = df_sales.merge(df_univ_vacant, on="key", how="left")

    print(f"After univ merge len = {len(df_sales)}")

    # mark which sales are to be used: only those that are valid and within the
    # specified time frame, with separate cutoffs for improved and vacant sales
    df_sales.loc[
        df_sales["sale_year"].lt(use_sales_from_impr)
        & df_sales["vacant_sale"].eq(False),
        "valid_sale",
    ] = False
    df_sales.loc[
        df_sales["sale_year"].lt(use_sales_from_vacant)
        & df_sales["vacant_sale"].eq(True),
        "valid_sale",
    ] = False

    # sale prices of 0 and negative and null are invalid
    df_sales.loc[
        df_sales["sale_price"].isna() | df_sales["sale_price"].le(0), "valid_sale"
    ] = False

    # scrub sales info from invalid sales
    idx_invalid = df_sales["valid_sale"].eq(False)
    fields_to_scrub = [
        "sale_date",
        "sale_price",
        "sale_year",
        "sale_month",
        "sale_day",
        "sale_quarter",
        "sale_year_quarter",
        "sale_year_month",
        "sale_age_days",
        "sale_price_per_land_sqft",
        "sale_price_per_impr_sqft",
        "sale_price_time_adj",
        "sale_price_time_adj_per_land_sqft",
        "sale_price_time_adj_per_impr_sqft",
    ]

    for field in fields_to_scrub:
        if field in df_sales:
            df_sales.loc[idx_invalid, field] = None

    # drop all invalid sales:
    df_sales = df_sales[df_sales["valid_sale"].eq(True)].copy()

    # initialize these -- we want to further determine which valid sales are valid for ratio studies
    df_sales["valid_for_ratio_study"] = False
    df_sales["valid_for_land_ratio_study"] = False

    # NORMAL RATIO STUDIES:
    # If it's a valid sale, and its vacancy status matches its status at time of sale, it's valid for a ratio study
    # This is because how it looked at time of sale matches how it looks now, so the prediction is comparable to the sale
    # If the vacancy status has changed since it sold, we can't meaningfully compare sale price to current valuation
    df_sales.loc[
        df_sales["valid_sale"] & df_sales["vacant_sale"].eq(df_sales["univ_is_vacant"]),
        "valid_for_ratio_study",
    ] = True

    # LAND RATIO STUDIES:
    # If it's a valid sale, and it was vacant at time of sale, it's valid for a LAND ratio study regardless of whether it
    # is valid for a normal ratio study. That's because we will come up with a land value prediction no matter what, and
    # we can always compare that to what it sold for, as long as it was vacant at time of sale
    df_sales.loc[
        df_sales["valid_sale"] & df_sales["vacant_sale"].eq(True),
        "valid_for_land_ratio_study",
    ] = True

    print(f"Using {len(df_sales[df_sales['valid_sale'].eq(True)])} sales...")
    print(f"--> {len(df_sales[df_sales['vacant_sale'].eq(True)])} vacant sales")
    print(f"--> {len(df_sales[df_sales['vacant_sale'].eq(False)])} improved sales")
    print(
        f"--> {len(df_sales[df_sales['valid_for_ratio_study'].eq(True)])} valid for ratio study"
    )
    print(
        f"--> {len(df_sales[df_sales['valid_for_land_ratio_study'].eq(True)])} valid for land ratio study"
    )

    # We need to ensure that the flag "is_vacant" is valid to train on
    # So in sales it needs to reflect the sale's vacant status
    # When hydrating, this will stomp the universe's vacant status, which is exactly what we want in a training set
    # Meanwhile, during prediction, it will infer based on the universe's vacant status
    df_sales["is_vacant"] = df_sales["vacant_sale"]

    df_sales = df_sales.drop(columns=["univ_is_vacant"])

    # enforce some booleans:
    bool_fields = [
        "valid_sale",
        "vacant_sale",
        "valid_for_ratio_study",
        "valid_for_land_ratio_study",
    ]
    for b in bool_fields:
        if b in df_sales:
            dtype = df_sales[b].dtype
            if dtype != bool:
                if _is_series_all_bools(df_sales[b]):
                    df_sales[b] = df_sales[b].astype(bool)
                else:
                    raise ValueError(
                        f"Field '{b}' contains non-boolean values that cannot be coerced to boolean. Unique values = {df_sales[b].unique()}"
                    )

    sup.update_sales(df_sales, allow_remove_rows=True)

    return sup
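
The ratio-study classification above can be summarized with a toy sketch (hypothetical values, mirroring the logic in the source):

    import pandas as pd

    sales = pd.DataFrame({
        "valid_sale":     [True, True, True],
        "vacant_sale":    [True, True, False],   # status at time of sale
        "univ_is_vacant": [True, False, False],  # status in the universe today
    })

    # NORMAL ratio study: vacancy status is unchanged since the sale
    sales["valid_for_ratio_study"] = (
        sales["valid_sale"] & sales["vacant_sale"].eq(sales["univ_is_vacant"])
    )
    # LAND ratio study: vacant at time of sale, regardless of current status
    sales["valid_for_land_ratio_study"] = sales["valid_sale"] & sales["vacant_sale"]

    # Row 0: vacant then, vacant now   -> valid for both studies
    # Row 1: vacant then, improved now -> land ratio study only
    # Row 2: improved then and now     -> normal ratio study only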

fill_unknown_values_sup

fill_unknown_values_sup(sup, settings)

Fill unknown values with default values as specified in settings.

Parameters:

    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data. (required)
    settings : dict
        The settings dictionary containing configuration for filling unknown values. (required)

Returns:

    SalesUniversePair
        The updated SalesUniversePair with filled unknown values.
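
Example (illustrative). The overlay behavior described in the source comments below can be sketched with toy data (hypothetical column names, not the library's internals): a blank characteristic in sales falls through to the universe's value on hydration, which is why this function does not default-fill those fields:

    import pandas as pd

    univ = pd.DataFrame({"key": ["A", "B"], "quality": ["good", "fair"]})
    sales = pd.DataFrame({"key": ["A", "B"], "quality": [pd.NA, "poor"]})

    # A non-blank sale characteristic overrides the universe; a blank
    # falls through to the universe's value.
    hydrated = sales.merge(univ, on="key", suffixes=("_sale", "_univ"))
    hydrated["quality"] = hydrated["quality_sale"].fillna(hydrated["quality_univ"])
    # key A -> "good" (from universe), key B -> "poor" (sale overlay)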

Source code in openavmkit/cleaning.py
def fill_unknown_values_sup(
    sup: SalesUniversePair, settings: dict
) -> SalesUniversePair:
    """Fill unknown values with default values as specified in settings.

    Parameters
    ----------
    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data.
    settings : dict
        The settings dictionary containing configuration for filling unknown values.

    Returns
    -------
    SalesUniversePair
        The updated SalesUniversePair with filled unknown values.
    """
    df_sales = sup["sales"].copy()
    df_univ = sup["universe"].copy()

    # Fill ALL unknown values for the universe
    df_univ = _fill_unknown_values_per_model_group(df_univ, settings)

    # For sales, fill ONLY the unknown values that pertain to sales metadata
    # df_sales can contain characteristics, but we want to preserve the blanks in those fields because they function
    # as overlays on top of the universe data
    dd = get_data_dictionary(settings)
    sale_fields = get_grouped_fields_from_data_dictionary(dd, "sale")
    sale_fields = [field for field in sale_fields if field in df_sales]

    df_sales_subset = df_sales[sale_fields].copy()
    df_sales_subset = _fill_unknown_values(df_sales_subset, settings)
    for col in df_sales_subset:
        df_sales[col] = df_sales_subset[col]

    sup.set("sales", df_sales)
    sup.set("universe", df_univ)

    return sup

validate_arms_length_sales

validate_arms_length_sales(sup, settings, verbose=False)

Validate arms-length sales based on configurable filter conditions.

Parameters:

    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data. (required)
    settings : dict
        The settings dictionary containing configuration for arms-length validation. (required)
    verbose : bool
        If True, prints detailed information about the validation process. (default: False)

Returns:

    SalesUniversePair
        The updated SalesUniversePair with arms-length validation applied to sales data.
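
Example (illustrative). A hedged settings fragment whose nesting mirrors the keys the function reads (data.process.arms_length_validation); the condition schema itself is defined by resolve_filter, and the entries shown here are placeholders:

    # Hypothetical settings fragment -- condition syntax is illustrative only.
    settings = {
        "data": {
            "process": {
                "arms_length_validation": {
                    "enabled": True,
                    "filter": {
                        "enabled": True,
                        "conditions": [
                            # placeholder; see resolve_filter for the real schema
                            {"field": "deed_type", "op": "isin",
                             "values": ["quitclaim", "sheriff_deed"]},
                        ],
                    },
                }
            }
        }
    }

    sup = validate_arms_length_sales(sup, settings, verbose=True)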

Source code in openavmkit/cleaning.py
def validate_arms_length_sales(
    sup: SalesUniversePair, settings: dict, verbose: bool = False
) -> SalesUniversePair:
    """Validate arms-length sales using based on configurable filter conditions.

    Parameters
    ----------
    sup : SalesUniversePair
        The SalesUniversePair containing sales and universe data.
    settings : dict
        The settings dictionary containing configuration for arms-length validation.
    verbose : bool, optional
        If True, prints detailed information about the validation process. Default is False.

    Returns
    -------
    SalesUniversePair
        The updated SalesUniversePair with arms-length validation applied to sales data.
    """
    s_data = settings.get("data", {})
    s_process = s_data.get("process", {})
    s_validation = s_process.get("arms_length_validation", {})

    if not s_validation.get("enabled", False):
        if verbose:
            print("Arms length validation disabled, skipping...")
        return sup

    # Get sales data
    df_sales = sup["sales"].copy()
    total_sales = len(df_sales)
    excluded_sales = []
    total_excluded = 0

    # Identify sales by filter
    filter_settings = s_validation.get("filter", {})
    if filter_settings.get("enabled", False):
        if verbose:
            print("\nApplying filter method...")

        # Get filter conditions from settings
        filter_conditions = filter_settings.get("conditions", [])
        if not filter_conditions:
            raise ValueError("No filter conditions defined in settings")

        # Resolve filter using standard filter resolution
        filter_mask = resolve_filter(df_sales, filter_conditions)

        # Get keys of filtered sales
        filtered_keys = df_sales[filter_mask]["key_sale"].tolist()
        if filtered_keys:
            excluded_info = {
                "method": "filter",
                "key_sales": filtered_keys,
                "total_sales": total_sales,
                "excluded": len(filtered_keys),
                "conditions": filter_conditions,
            }
            excluded_sales.append(excluded_info)
            total_excluded += len(filtered_keys)

            # Mark these sales as invalid
            df_sales.loc[df_sales["key_sale"].isin(filtered_keys), "valid_sale"] = False

            if verbose:
                print(f"--> Found {len(filtered_keys)} sales excluded by filter method")

    if verbose:
        print(f"\nOverall summary:")
        print(f"--> Total sales processed: {total_sales}")
        print(
            f"--> Total sales excluded: {total_excluded} ({total_excluded/total_sales*100:.1f}%)"
        )

    # Cache the excluded sales info
    if excluded_sales:
        cache_data = {
            "excluded_sales": excluded_sales,
            "total_sales": total_sales,
            "total_excluded": total_excluded,
            "settings": s_validation,
        }
        write_cache("arms_length_validation", cache_data, cache_data, "dict")

    # Filter out invalid sales and update the SalesUniversePair
    df_sales = df_sales[df_sales["valid_sale"].eq(True)].copy()
    sup.set("sales", df_sales)
    return sup