openavmkit.sales_scrutiny_study

SalesScrutinyStudy

SalesScrutinyStudy(df, settings, model_group)

Sales scrutiny study object

This class performs cluster-based analysis on sales to identify anomalies. Anomalous sales detected by this method are more likely to be invalid. The study helps the modeler narrow down which sales should be scrutinized.

Attributes:

df_vacant : DataFrame
    DataFrame of sales that were allegedly vacant (no building) at time of sale
df_improved : DataFrame
    DataFrame of sales that were allegedly improved (had a building) at time of sale
settings : dict
    Settings dictionary
model_group : str
    The model group to investigate
summaries : dict[str, SalesScrutinyStudySummary]
    Dictionary in which the results are stored

Initialize a SalesScrutinyStudy object

Parameters:

df : DataFrame (required)
    The data you wish to analyze
settings : dict (required)
    Settings dictionary
model_group : str (required)
    Model group to analyze
Source code in openavmkit/sales_scrutiny_study.py
def __init__(self, df: pd.DataFrame, settings: dict, model_group: str):
    """Initialize a SalesScrutinyStudy object

    Parameters
    ----------
    df : pd.DataFrame
        The data you wish to analyze
    settings : dict
        Settings dictionary
    model_group : str
        Model group to analyze
    """
    self.model_group = model_group

    df = df[df["model_group"].eq(model_group)]
    df = _get_sales(df, settings)

    df_vacant = get_vacant_sales(df, settings)
    df_improved = get_vacant_sales(df, settings, invert=True)

    stuff = {"i": df_improved, "v": df_vacant}

    sale_field = get_sale_field(settings)
    important_fields = get_important_fields(settings, df)
    location_fields = get_locations(settings, df)
    self.summaries = {
        "i": SalesScrutinyStudySummary(),
        "v": SalesScrutinyStudySummary(),
    }

    for key in stuff:
        df = stuff[key]
        df, cluster_fields = _mark_sales_scrutiny_clusters(df, settings)
        df["ss_id"] = df["model_group"] + "_" + key + "_" + df["ss_id"].astype(str)
        per_sqft = ""
        denominator = ""
        if key == "i":
            per_sqft = "impr_sqft"
            denominator = "bldg_area_finished_sqft"
        elif key == "v":
            per_sqft = "land_sqft"
            denominator = "land_area_sqft"

        sale_field_per = f"{sale_field}_{per_sqft}"
        df[sale_field_per] = div_df_z_safe(df, sale_field, denominator)

        other_fields = cluster_fields + location_fields + important_fields
        other_fields = list(dict.fromkeys(other_fields))
        other_fields += [
            "address",
            "deed_book",
            "deed_page",
            "sale_date",
            "valid_sale",
            "vacant_sale",
        ]

        other_fields = [f for f in other_fields if f in df]

        df_cluster_fields = df[["key_sale"] + other_fields]
        df = _calc_sales_scrutiny(df, sale_field_per)
        df = df.merge(df_cluster_fields, on="key_sale", how="left")

        total_anomalies = 0
        for i in range(1, 6):
            field = f"anomaly_{i}"
            if field in df:
                count_anomaly = len(df[df[field].eq(True)])
            else:
                count_anomaly = 0
            total_anomalies += count_anomaly
            self.summaries[key].num_flagged_sales_by_type[field] = count_anomaly

        if "flagged" in df:
            self.summaries[key].num_sales_flagged = len(df[df["flagged"].eq(True)])
        else:
            self.summaries[key].num_sales_flagged = 0
        self.summaries[key].num_sales_total = len(df)

        stuff[key] = df

    self.df_vacant = stuff["v"]
    self.df_improved = stuff["i"]
    self.settings = settings
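
A minimal usage sketch; the df and settings objects are assumed to come from the caller's own data-loading step, and "residential_sf" is a hypothetical model group name:

from openavmkit.sales_scrutiny_study import SalesScrutinyStudy

# Construct the study for one model group; the constructor runs the
# cluster analysis immediately and stores results in `summaries`.
study = SalesScrutinyStudy(df, settings, model_group="residential_sf")

# Inspect the improved ("i") and vacant ("v") summaries:
for key, summary in study.summaries.items():
    print(key, summary.num_sales_flagged, "of", summary.num_sales_total, "flagged")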

get_scrutinized

get_scrutinized(df_in, verbose=False)

Mark flagged sales as invalid in the dataset and return the modified dataset

Parameters:

df_in : DataFrame (required)
    The dataframe you wish to clean
verbose : bool (default: False)
    Whether to print verbose output
Source code in openavmkit/sales_scrutiny_study.py
def get_scrutinized(self, df_in: pd.DataFrame, verbose: bool = False):
    """Remove flagged sales from the dataset and return the modified dataset

    Parameters
    ----------
    df_in : pd.DataFrame
        The dataframe you wish to clean
    verbose : bool, optional
        Whether to print verbose output. Default False.
    """
    df = df_in.copy()

    df_v = self.df_vacant
    df_i = self.df_improved

    keys_flagged = []

    if "flagged" in df_v:
        # remove flagged sales:
        vacant_flags = df_v[df_v["flagged"].eq(True)]["key_sale"].tolist()
        keys_flagged += vacant_flags
        if verbose:
            print(f"--> Flagged {len(vacant_flags)} vacant sales")

    if "flagged" in df_i:
        improved_flags = df_i[df_i["flagged"].eq(True)]["key_sale"].tolist()
        keys_flagged += improved_flags
        if verbose:
            print(f"--> Flagged {len(improved_flags)} improved sales")

    # ensure unique:
    keys_flagged = list(dict.fromkeys(keys_flagged))

    if len(df) > 0:

        num_valid_sales_before = len(df[df["valid_sale"].eq(True)])

        df.loc[df["key_sale"].isin(keys_flagged), "valid_sale"] = False

        num_valid_sales_after = len(df[df["valid_sale"].eq(True)])

        if verbose:
            print(f"--> Unmarked sales before: {num_valid_sales_before}")
            print(f"--> Unmarked sales after: {num_valid_sales_after}")
            diff = num_valid_sales_before - num_valid_sales_after
            print(f"--> Marked {diff} new potentially invalid sales")

        # merge ss_id into df:
        df = combine_dfs(df, df_v[["key_sale", "ss_id"]], index="key_sale")
        df = combine_dfs(df, df_i[["key_sale", "ss_id"]], index="key_sale")

    return df
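
A hedged usage sketch, continuing from the study object constructed above; df is the same frame passed to the constructor:

# Mark flagged sales as invalid and attach the ss_id cluster column.
df_clean = study.get_scrutinized(df, verbose=True)

# Flagged rows remain in the frame, but with valid_sale set to False:
print(df_clean["valid_sale"].value_counts())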

write

write(path)

Writes the report to disk

Parameters:

path : str (required)
    The root path
Source code in openavmkit/sales_scrutiny_study.py
def write(self, path: str):
    """Writes the report to disk

    Parameters
    ----------
    path : str
        The root path
    """
    self._write(path, True)
    self._write(path, False)

SalesScrutinyStudySummary

SalesScrutinyStudySummary()

Summary statistics for a sales scrutiny study

Attributes:

num_sales_flagged : int
    The number of sales flagged by the study
num_sales_total : int
    The number of sales that were tested
num_flagged_sales_by_type : dict[str, int]
    Dictionary breaking down the number of flagged sales by anomaly type

Source code in openavmkit/sales_scrutiny_study.py
def __init__(self):
    self.num_sales_flagged = 0
    self.num_sales_total = 0
    self.num_flagged_sales_by_type = {}

drop_manual_exclusions

drop_manual_exclusions(sup, settings, verbose=False)

Drops sales that the user has individually marked as invalid

Parameters:

sup : SalesUniversePair (required)
    The data you want to clean
settings : dict (required)
    Settings dictionary
verbose : bool (default: False)
    Whether to print verbose output

Returns:

SalesUniversePair
    The original data with any modifications

Source code in openavmkit/sales_scrutiny_study.py
def drop_manual_exclusions(
    sup: SalesUniversePair, settings: dict, verbose: bool = False
) -> SalesUniversePair:
    """Drops sales that the user has individually marked as invalid

    Parameters
    ----------
    sup : SalesUniversePair
        The data you want to clean
    settings : dict
        Settings dictionary
    verbose : bool, optional
        Whether to print verbose output. Default is False.

    Returns
    -------
    SalesUniversePair
        The original data with any modifications
    """
    invalid_key_file: str | None = (
        settings.get("analysis", {})
        .get("sales_scrutiny", {})
        .get("invalid_key_file", None)
    )
    if invalid_key_file is not None:
        if os.path.exists(invalid_key_file):
            df_invalid_keys = pd.read_csv(invalid_key_file, dtype={"key_sale": str})
            bad_keys = df_invalid_keys["key_sale"].tolist()
        else:
            warnings.warn(
                f"--> Invalid key file {invalid_key_file} does not exist, skipping manual exclusions"
            )
            bad_keys = []
        df_sales = sup.sales.copy()
        len_before = len(df_sales)
        df_sales = df_sales[~df_sales["key_sale"].isin(bad_keys)]
        len_after = len(df_sales)
        num_dropped = len_before - len_after
        if verbose:
            print("")
            print(f"Dropping {num_dropped} manual exclusions from sales scrutiny")
        sup.sales = df_sales
    else:
        if verbose:
            print("")
            print(
                "No manual exclusions file specified in settings, skipping manual exclusions"
            )
    return sup
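
A sketch of the configuration this function reads; the file path is illustrative, and per the pd.read_csv call above the CSV only needs a key_sale column:

settings = {
    "analysis": {
        "sales_scrutiny": {
            # Hypothetical path: a CSV of manually excluded sales
            "invalid_key_file": "in/invalid_sale_keys.csv",
        }
    }
}

sup = drop_manual_exclusions(sup, settings, verbose=True)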

mark_ss_ids_per_model_group

mark_ss_ids_per_model_group(df_in, settings, verbose=False)

Cluster parcels for a sales scrutiny study by assigning sales scrutiny IDs.

This function processes each model group within the provided dataset, identifies clusters of parcels for scrutiny, and writes the cluster identifiers into a new field.

Parameters:

df_in : DataFrame (required)
    The data you want to mark
settings : dict (required)
    Configuration settings
verbose : bool (default: False)
    If True, prints verbose output during processing

Returns:

DataFrame
    Updated DataFrame with marked sales scrutiny IDs

Source code in openavmkit/sales_scrutiny_study.py
def mark_ss_ids_per_model_group(
    df_in: pd.DataFrame, settings: dict, verbose: bool = False
) -> pd.DataFrame:
    """
    Cluster parcels for a sales scrutiny study by assigning sales scrutiny IDs.

    This function processes each model group within the provided dataset,
    identifies clusters of parcels for scrutiny, and writes the cluster identifiers
    into a new field.

    Parameters
    ----------
    df_in : pd.DataFrame
        The data you want to mark
    settings : dict
        Configuration settings.
    verbose : bool, optional
        If True, prints verbose output during processing. Defaults to False.

    Returns
    -------
    pd.DataFrame
        Updated DataFrame with marked sales scrutiny IDs.
    """
    # Mark the sales scrutiny IDs
    df = do_per_model_group(
        df_in.copy(),
        settings,
        _mark_ss_ids,
        {"settings": settings, "verbose": verbose},
        key="key_sale",
        instructions={"just_stomp_columns": ["ss_id"]},
    )
    return df
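
A minimal call sketch, assuming df carries the model_group and key_sale columns that the per-model-group dispatch expects:

# Assign an ss_id to every sale, one clustering pass per model group.
df = mark_ss_ids_per_model_group(df, settings, verbose=True)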

run_heuristics

run_heuristics(sup, settings, drop=True, verbose=False)

Identifies and flags anomalous sales by heuristic. Drops them if the user specifies.

Parameters:

sup : SalesUniversePair (required)
    The data you want to analyze/clean
settings : dict (required)
    Settings dictionary
drop : bool (default: True)
    If True, drops all sales flagged by this method
verbose : bool (default: False)
    Whether to print verbose output

Returns:

SalesUniversePair
    The original data with any modifications

Source code in openavmkit/sales_scrutiny_study.py
def run_heuristics(
    sup: SalesUniversePair, settings: dict, drop: bool = True, verbose: bool = False
) -> SalesUniversePair:
    """
    Identifies and flags anomalous sales by heuristic. Drops them if the user specifies.

    Parameters
    ----------
    sup : SalesUniversePair
        The data you want to analyze/clean
    settings : dict
        Settings dictionary
    drop : bool, optional
        If True, drops all sales flagged by this method. Default is True.
    verbose : bool, optional
        Whether to print verbose output. Default is False.

    Returns
    -------
    SalesUniversePair
        The original data with any modifications
    """
    ss = settings.get("analysis", {}).get("sales_scrutiny", {})
    deed_id = ss.get("deed_id", "deed_id")

    df_sales = get_hydrated_sales_from_sup(sup)

    #### Multi-parcel sales detection heuristics

    # 1 -- Flag sales with identical deed IDs AND identical sale dates
    df_sales["deed_date"] = df_sales[deed_id].astype(str)+"---"+df_sales["sale_date"].astype(str)
    vcs_deed_date = df_sales["deed_date"].value_counts()
    idx_dupe_deed_dates = vcs_deed_date[vcs_deed_date > 1].index.values
    df_sales.loc[
        df_sales["deed_date"].isin(idx_dupe_deed_dates),
        "flag_dupe_deed_date"
    ] = True
    # drop extraneous column
    df_sales = df_sales.drop(columns="deed_date")

    # 2 -- Flag sales made on the same date for the same price
    df_sales["date_price"] = df_sales["sale_date"].astype(str) + "---" + df_sales["sale_price"].astype(str)
    vcs_date_price = df_sales["date_price"].value_counts()
    idx_dupe_date_price = vcs_date_price[vcs_date_price > 1].index.values
    df_sales.loc[
        df_sales["date_price"].isin(idx_dupe_date_price),
        "flag_dupe_date_price",
    ] = True
    # drop extraneous column
    df_sales = df_sales.drop(columns="date_price")

    #### Misclassified vacant sales detection heuristics

    # 3 -- Flag vacant sales with a building year built older than the sale year
    idx_false_vacant = (
        df_sales["vacant_sale"].eq(1)
        & df_sales["bldg_year_built"].gt(0)
        & df_sales["bldg_year_built"].lt(df_sales["sale_year"])
    )
    df_sales.loc[idx_false_vacant, "flag_false_vacant"] = True

    files = {
        "flag_dupe_deed_date": "duplicated_deeds_and_dates",
        "flag_dupe_date_price": "duplicated_dates_and_prices",
        "flag_false_vacant": "classified_vacant_but_bldg_older_than_sale_year",
    }

    locations = get_locations(settings, df_sales)

    bad_keys = []

    if verbose:
        print(f"Validating sales by heuristic, {len(df_sales)} total sales")

    for key in files:
        if key in df_sales.columns:
            df = df_sales[df_sales[key].eq(True)]
            if len(df) > 0:
                path = f"out/sales_scrutiny/{files[key]}.xlsx"

                cols = (
                    [
                        "key_sale",
                        "sale_date",
                        "sale_price",
                        "sale_price_time_adj",
                        "deed_book",
                        "deed_page",
                        "deed_id",
                        "sale_year",
                        "bldg_year_built",
                        "bldg_area_finished_sqft",
                        "land_area_sqft",
                        "valid_sale",
                        "vacant_sale",
                        "address",
                    ]
                    + locations
                    + [key]
                )

                cols = [col for col in cols if col in df]
                df = df[cols]

                _bad_keys = df["key_sale"].tolist()

                if verbose:
                    print(f"--> {len(_bad_keys)} bad keys for heuristic: {key}")

                bad_keys = list(set(bad_keys + _bad_keys))

                df = df.rename(
                    columns={
                        "key_sale": "Sale key",
                        "sale_date": "Sale date",
                        "sale_price": "Sale price",
                        "sale_price_time_adj": "Sale price\n(Time adj.)",
                        "deed_book": "Deed book",
                        "deed_page": "Deed page",
                        "deed_id": "Deed ID",
                        "sale_year": "Year sold",
                        "bldg_year_built": "Year built",
                        "bldg_area_finished_sqft": "Impr sqft",
                        "land_area_sqft": "Land sqft",
                        "valid_sale": "Valid sale",
                        "vacant_sale": "Vacant sale",
                        "flag_dupe_deed_date": "Repeated\ndeed & sale date",
                        "flag_dupe_date_price": "Repeated\nsale date & price",
                        "flag_false_vacant": "Bldg older\nthan sale year",
                    }
                )

                _curr_0 = {"num_format": "#,##0"}
                _date = {"num_format": "yyyy-mm-dd"}

                columns = {
                    "Sale date": _date,
                    "Sale price": _curr_0,
                    "Sale price\n(Time adj.)": _curr_0,
                }

                column_conditions = {
                    "Repeated\ndeed ID": {
                        "type": "cell",
                        "criteria": "==",
                        "value": "TRUE",
                        "format": {"bold": True, "font_color": "red"},
                    },
                    "Repeated\nsale date & price": {
                        "type": "cell",
                        "criteria": "==",
                        "value": "TRUE",
                        "format": {"bold": True, "font_color": "red"},
                    },
                    "Bldg older\nthan sale year": {
                        "type": "cell",
                        "criteria": "==",
                        "value": "TRUE",
                        "format": {"bold": True, "font_color": "red"},
                    },
                }

                write_to_excel(
                    df,
                    path,
                    {"columns": {"formats": columns, "conditions": column_conditions}},
                )

    if drop:
        print(f"Dropped {len(bad_keys)} invalid sales keys identified by heuristic")
        df_sales = sup.sales.copy()
        df_sales = df_sales[~df_sales["key_sale"].isin(bad_keys)]
        sup.set("sales", df_sales)
    else:
        print(
            f"Identified {len(bad_keys)} invalid sales keys by heuristic, but not dropping them"
        )

    _run_land_percentiles(sup, settings)

    return sup
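
The two duplicate-detection heuristics above share one pattern: join the relevant columns into a composite key, then flag every key that occurs more than once. A standalone sketch of that pattern on toy data:

import pandas as pd

df = pd.DataFrame({
    "key_sale": ["a", "b", "c"],
    "deed_id": ["D1", "D1", "D2"],
    "sale_date": ["2020-01-01", "2020-01-01", "2020-03-05"],
})

# Composite key: deed ID + sale date, joined with an unambiguous separator.
combo = df["deed_id"].astype(str) + "---" + df["sale_date"].astype(str)

# Any composite value seen more than once marks a likely multi-parcel sale.
counts = combo.value_counts()
dupes = counts[counts > 1].index
df.loc[combo.isin(dupes), "flag_dupe_deed_date"] = True
print(df)  # rows "a" and "b" are flagged; "c" is left as NaN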

run_sales_scrutiny

run_sales_scrutiny(df_in, settings, model_group, verbose=False)

Run sales scrutiny analysis on an individual model group

Parameters:

df_in : DataFrame (required)
    The data that you want to analyze
settings : dict (required)
    Configuration settings
model_group : str (required)
    The model group you want to analyze
verbose : bool (default: False)
    If True, enables verbose logging

Returns:

DataFrame
    Updated DataFrame after sales scrutiny analysis

Source code in openavmkit/sales_scrutiny_study.py
def run_sales_scrutiny(
    df_in: pd.DataFrame, settings: dict, model_group: str, verbose=False
) -> pd.DataFrame:
    """
    Run sales scrutiny analysis on an individual model group

    Parameters
    ----------
    df_in : pd.DataFrame
        The data that you want to analyze
    settings : dict
        Configuration settings.
    model_group : str
        The model group you want to analyze
    verbose : bool, optional
        If True, enables verbose logging. Defaults to False.

    Returns
    -------
    pd.DataFrame
        Updated DataFrame after sales scrutiny analysis.
    """
    # run sales validity:
    ss = SalesScrutinyStudy(df_in, settings, model_group=model_group)
    ss.write(f"out/sales_scrutiny/{model_group}")

    # clean sales data:
    return ss.get_scrutinized(df_in, verbose=verbose)
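
A hedged usage sketch; the model group name is illustrative:

# Run the study for one model group, write the report under
# out/sales_scrutiny/<model_group>, and return the cleaned frame.
df = run_sales_scrutiny(df, settings, model_group="residential_sf", verbose=True)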

run_sales_scrutiny_per_model_group

run_sales_scrutiny_per_model_group(df_in, settings, verbose=False)

Run sales scrutiny analysis for each model group in the dataset.

Parameters:

df_in : DataFrame (required)
    The data that you want to analyze
settings : dict (required)
    Configuration settings
verbose : bool (default: False)
    If True, enables verbose logging

Returns:

DataFrame
    Updated DataFrame after sales scrutiny analysis

Source code in openavmkit/sales_scrutiny_study.py
def run_sales_scrutiny_per_model_group(
    df_in: pd.DataFrame, settings: dict, verbose=False
) -> pd.DataFrame:
    """
    Run sales scrutiny analysis for each model group in the dataset.

    Parameters
    ----------
    df_in : pd.DataFrame
        The data that you want to analyze
    settings : dict
        Configuration settings.
    verbose : bool, optional
        If True, enables verbose logging. Defaults to False.

    Returns
    -------
    pd.DataFrame
        Updated DataFrame after sales scrutiny analysis.
    """
    # Run sales scrutiny for each model group
    df = do_per_model_group(
        df_in.copy(),
        settings,
        run_sales_scrutiny,
        {"settings": settings, "verbose": verbose},
        key="key_sale",
    )
    return df
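
A minimal call sketch; df_sales is assumed to be a sales frame carrying the model_group and key_sale columns:

# Run the scrutiny pass over every model group in the sales frame.
df_sales = run_sales_scrutiny_per_model_group(df_sales, settings, verbose=True)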