`openavmkit.horizontal_equity_study`

HorizontalEquityClusterSummary

HorizontalEquityClusterSummary(id, count, chd, min, max, median)

Summary for an individual horizontal equity cluster.

Attributes:

Name	Type	Description
`id`	`str`	Identifier of the cluster.
`count`	`int`	Number of records in the cluster.
`chd`	`float`	CHD value for the cluster.
`min`	`float`	Minimum value in the cluster.
`max`	`float`	Maximum value in the cluster.
`median`	`float`	Median value in the cluster.

Initialize a HorizontalEquityClusterSummary instance.

Parameters:

Name	Type	Description	Default
`id`	`str`	Cluster identifier.	required
`count`	`int`	Number of records in the cluster.	required
`chd`	`float`	CHD value for the cluster.	required
`min`	`float`	Minimum value in the cluster.	required
`max`	`float`	Maximum value in the cluster.	required
`median`	`float`	Median value in the cluster.	required

Source code in openavmkit/horizontal_equity_study.py

def __init__(
    self, id: str, count: int, chd: float, min: float, max: float, median: float
):
    """
    Build the summary record for a single horizontal equity cluster.

    Parameters
    ----------
    id : str
        Identifier of the cluster being summarized.
    count : int
        How many records belong to the cluster.
    chd : float
        CHD value for the cluster.
    min : float
        Smallest value found in the cluster.
    max : float
        Largest value found in the cluster.
    median : float
        Median of the values in the cluster.
    """
    # Plain value holder: each argument is stored unchanged under its own name.
    self.id, self.count, self.chd = id, count, chd
    self.min, self.max, self.median = min, max, median

HorizontalEquityStudy

HorizontalEquityStudy(df, field_cluster, field_value)

Perform horizontal equity analysis and summarize the results.

Attributes:

Name	Type	Description
`summary`	`HorizontalEquitySummary`	Overall summary statistics.
`cluster_summaries`	`dict[str, HorizontalEquityClusterSummary]`	Dictionary mapping cluster IDs to their summaries.

Initialize a HorizontalEquityStudy instance by computing cluster summaries.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame containing data for horizontal equity analysis.	required
`field_cluster`	`str`	Column name indicating cluster membership.	required
`field_value`	`str`	Column name of the values to analyze.	required

Source code in openavmkit/horizontal_equity_study.py

def __init__(self, df: pd.DataFrame, field_cluster: str, field_value: str):
    """
    Initialize a HorizontalEquityStudy instance by computing cluster summaries.

    For every distinct value found in ``field_cluster`` a
    ``HorizontalEquityClusterSummary`` (count, CHD, min, max and median of
    ``field_value``) is stored in ``self.cluster_summaries`` under that value.
    The per-cluster CHDs are then reduced to their min, max, median and
    5th/25th/75th/95th percentiles and stored in ``self.summary`` as a
    ``HorizontalEquitySummary``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame containing data for horizontal equity analysis.
    field_cluster : str
        Column name indicating cluster membership.
    field_value : str
        Column name of the values to analyze.

    Notes
    -----
    A cluster whose CHD is NaN makes every overall CHD statistic NaN, because
    ``np.min``/``np.max``/``np.median``/``np.quantile`` propagate NaN.
    """

    # Look the membership column up once instead of once per cluster.
    cluster_column = df[field_cluster]
    clusters = cluster_column.unique()
    self.cluster_summaries = {}

    # Accumulate in a list: np.append copies the whole array on every call,
    # which made the original loop quadratic in the number of clusters.
    chd_values = []
    for cluster in clusters:
        df_cluster = df[cluster_column.eq(cluster)]
        count = len(df_cluster)
        if count > 0:
            values = df_cluster[field_value]
            # CHD is obtained by running the COD calculation on the cluster's values.
            chd = stats.calc_cod(values.values)
            min_value = values.min()
            max_value = values.max()
            median_value = values.median()
        else:
            # No row matched this label (e.g. a NaN label, which `.eq` never
            # matches), so there is nothing to measure.
            chd = float("nan")
            min_value = float("nan")
            max_value = float("nan")
            median_value = float("nan")
        self.cluster_summaries[cluster] = HorizontalEquityClusterSummary(
            cluster, count, chd, min_value, max_value, median_value
        )
        chd_values.append(chd)

    # dtype=float matches the float64 array the original built from np.array([]).
    chds = np.asarray(chd_values, dtype=float)

    if len(chds) > 0:
        min_chd = np.min(chds)
        max_chd = np.max(chds)
        med_chd = float(np.median(chds))
        p05_chd = np.quantile(chds, 0.05)
        p25_chd = np.quantile(chds, 0.25)
        p75_chd = np.quantile(chds, 0.75)
        p95_chd = np.quantile(chds, 0.95)
    else:
        # No clusters at all (empty input): report every statistic as NaN.
        min_chd = float("nan")
        max_chd = float("nan")
        med_chd = float("nan")
        p05_chd = float("nan")
        p25_chd = float("nan")
        p75_chd = float("nan")
        p95_chd = float("nan")

    self.summary = HorizontalEquitySummary(
        len(df), len(clusters), min_chd, max_chd, med_chd, p05_chd, p25_chd, p75_chd, p95_chd
    )

HorizontalEquitySummary

HorizontalEquitySummary(rows, clusters, min_chd, max_chd, median_chd, p05_chd, p25_chd, p75_chd, p95_chd)

Summary statistics for horizontal equity analysis.

Attributes:

Name	Type	Description
`rows`	`int`	Total number of rows in the input DataFrame.
`clusters`	`int`	Total number of clusters identified.
`min_chd`	`float`	Minimum CHD (Coefficient of Horizontal Dispersion) value of any cluster.
`max_chd`	`float`	Maximum CHD value of any cluster.
`median_chd`	`float`	Median CHD value of all clusters.
`p05_chd`	`float`	5th percentile CHD value.
`p25_chd`	`float`	25th percentile CHD value.
`p75_chd`	`float`	75th percentile CHD value.
`p95_chd`	`float`	95th percentile CHD value.

Initialize a HorizontalEquitySummary instance.

Parameters:

Name	Type	Description	Default
`rows`	`int`	Total number of rows in the DataFrame.	required
`clusters`	`int`	Total number of clusters.	required
`min_chd`	`float`	Minimum CHD value.	required
`max_chd`	`float`	Maximum CHD value.	required
`median_chd`	`float`	Median CHD value.	required
`p05_chd`	`float`	5th percentile CHD value.	required
`p25_chd`	`float`	25th percentile CHD value.	required
`p75_chd`	`float`	75th percentile CHD value.	required
`p95_chd`	`float`	95th percentile CHD value.	required

Source code in openavmkit/horizontal_equity_study.py

def __init__(
    self,
    rows: int,
    clusters: int,
    min_chd: float,
    max_chd: float,
    median_chd: float,
    p05_chd: float,
    p25_chd: float,
    p75_chd: float,
    p95_chd: float
):
    """
    Store the overall statistics of a horizontal equity analysis.

    Parameters
    ----------
    rows : int
        Number of rows in the analyzed DataFrame.
    clusters : int
        Number of clusters.
    min_chd : float
        Smallest CHD value.
    max_chd : float
        Largest CHD value.
    median_chd : float
        Median CHD value.
    p05_chd : float
        5th percentile CHD value.
    p25_chd : float
        25th percentile CHD value.
    p75_chd : float
        75th percentile CHD value.
    p95_chd : float
        95th percentile CHD value.
    """
    # Size of the analysis.
    self.rows, self.clusters = rows, clusters

    # Extremes and centre of the CHD values.
    self.min_chd, self.max_chd, self.median_chd = min_chd, max_chd, median_chd

    # Percentiles of the CHD values.
    self.p05_chd, self.p25_chd = p05_chd, p25_chd
    self.p75_chd, self.p95_chd = p75_chd, p95_chd

mark_horizontal_equity_clusters

mark_horizontal_equity_clusters(df, settings, verbose=False, settings_object='horizontal_equity', id_name='he_id', output_folder='', t=None)

Compute and mark horizontal equity clusters in the DataFrame.

Uses clustering (via make_clusters) based on a location field and categorical/numeric fields specified in settings to generate a horizontal equity cluster ID which is stored in the specified id_name column.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	Input DataFrame.	required
`settings`	`dict`	Settings dictionary.	required
`verbose`	`bool`	If True, prints progress information.	`False`
`settings_object`	`str`	The settings object to use for horizontal equity analysis.	`'horizontal_equity'`
`id_name`	`str`	Name of the column to store the horizontal equity cluster ID.	`'he_id'`
`output_folder`	`str`	Output folder path (stores information about the clusters for later use).	`''`
`t`	`TimingData`	TimingData object to record performance metrics.	`None`

Returns:

Type	Description
`DataFrame`	DataFrame with a new cluster ID column (`id_name`).

Source code in openavmkit/horizontal_equity_study.py

def mark_horizontal_equity_clusters(
    df: pd.DataFrame,
    settings: dict,
    verbose: bool = False,
    settings_object: str = "horizontal_equity",
    id_name: str = "he_id",
    output_folder: str = "",
    t: TimingData = None,
) -> pd.DataFrame:
    """
    Assign a horizontal equity cluster ID to every row of the DataFrame.

    Reads the ``location``, ``fields_categorical`` and ``fields_numeric`` entries
    from ``settings["analysis"][settings_object]``, passes them to `make_clusters`,
    and writes the first element of its result into the `id_name` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame. The `id_name` column is written to it in place.
    settings : dict
        Settings dictionary.
    verbose : bool, optional
        If True, prints progress information.
    settings_object : str, optional
        Key under ``settings["analysis"]`` holding the clustering configuration.
    id_name : str, optional
        Name of the column that receives the cluster ID. When it contains the
        substring ``"land"``, `make_clusters` is called with
        ``split_on_vacant=False``; otherwise with ``split_on_vacant=True``.
    output_folder : str, optional
        Output folder path (stores information about the clusters for later use).
    t : TimingData, optional
        TimingData object to record performance metrics.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame that was passed in, now carrying the `id_name` column.
    """

    equity_cfg = settings.get("analysis", {}).get(settings_object, {})
    loc_field = equity_cfg.get("location", None)
    cat_fields = equity_cfg.get("fields_categorical", [])
    num_fields = equity_cfg.get("fields_numeric", None)
    unit = area_unit(settings)

    # Only the cluster IDs are kept; the other two returned values are discarded.
    cluster_ids, _, _ = make_clusters(
        df,
        loc_field,
        cat_fields,
        num_fields,
        split_on_vacant="land" not in id_name,
        verbose=verbose,
        output_folder=output_folder,
        unit=unit,
        t=t
    )
    df[id_name] = cluster_ids
    return df

mark_horizontal_equity_clusters_per_model_group_sup

mark_horizontal_equity_clusters_per_model_group_sup(sup, settings, verbose=False, use_cache=True, do_land_clusters=True, do_impr_clusters=True)

Mark horizontal equity clusters on the 'universe' DataFrame of a SalesUniversePair.

Updates the 'universe' DataFrame with horizontal equity clusters by calling mark_horizontal_equity_clusters and then sets the updated DataFrame in sup.

Parameters:

Name	Type	Description	Default
`sup`	`SalesUniversePair`	SalesUniversePair containing sales and universe data.	required
`settings`	`dict`	Settings dictionary.	required
`verbose`	`bool`	If True, prints progress information.	`False`
`use_cache`	`bool`	If True, uses cached DataFrame if available.	`True`
`do_land_clusters`	`bool`	If True, marks land horizontal equity clusters.	`True`
`do_impr_clusters`	`bool`	If True, marks improvement horizontal equity clusters.	`True`

Returns:

Type	Description
`SalesUniversePair`	Updated SalesUniversePair with marked horizontal equity clusters.

Source code in openavmkit/horizontal_equity_study.py

def mark_horizontal_equity_clusters_per_model_group_sup(
    sup: SalesUniversePair,
    settings: dict,
    verbose: bool = False,
    use_cache: bool = True,
    do_land_clusters: bool = True,
    do_impr_clusters: bool = True,
) -> SalesUniversePair:
    """
    Mark horizontal equity clusters on the 'universe' DataFrame of a SalesUniversePair.

    Runs the general clustering pass and, when requested, the land and improvement
    passes on ``sup["universe"]``, then stores the resulting DataFrame back into
    `sup`. If ``settings["analysis"]["horizontal_equity"]["enabled"]`` equals
    False, nothing is computed and `sup` is returned untouched.

    Parameters
    ----------
    sup : SalesUniversePair
        SalesUniversePair containing sales and universe data.
    settings : dict
        Settings dictionary.
    verbose : bool, optional
        If True, prints progress information.
    use_cache : bool, optional
        If True, uses cached DataFrame if available.
    do_land_clusters : bool, optional
        If True, marks land horizontal equity clusters (column ``land_he_id``,
        configured by ``analysis.land_equity``).
    do_impr_clusters : bool, optional
        If True, marks improvement horizontal equity clusters (column
        ``impr_he_id``, configured by ``analysis.impr_equity``).

    Returns
    -------
    SalesUniversePair
        Updated SalesUniversePair with marked horizontal equity clusters.

    Warns
    -----
    UserWarning
        If land clusters are requested but ``analysis.land_equity.location`` is
        not defined in `settings`.
    """

    he = settings.get("analysis", {}).get("horizontal_equity", {})
    enabled = he.get("enabled", True)

    # Equality (not truthiness) is kept on purpose: a missing or None value
    # leaves the feature on, exactly as before.
    if enabled == False:  # noqa: E712
        if verbose:
            print("Skipping horizontal equity clustering...")
        return sup

    df_universe = sup["universe"]
    if verbose:
        print("")
        print("Marking horizontal equity clusters...")
    df_universe = _mark_horizontal_equity_clusters_per_model_group(
        df_universe,
        settings,
        verbose,
        output_folder="horizontal_equity/general",
        use_cache=use_cache,
    )
    if do_land_clusters:
        if verbose:
            print("")
            print("Marking LAND horizontal equity clusters...")
        le = settings.get("analysis", {}).get("land_equity", {})
        location = le.get("location", None)
        if location is None:
            warnings.warn("You are creating land equity clusters, but you haven't defined `analysis.land_equity.location`. You should at least provide a location field if you want to use this feature.")
        df_universe = _mark_horizontal_equity_clusters_per_model_group(
            df_universe,
            settings,
            verbose,
            settings_object="land_equity",
            id_name="land_he_id",
            output_folder="horizontal_equity/land",
            use_cache=use_cache,
        )
    if do_impr_clusters:
        if verbose:
            print("")
            print("Marking IMPROVEMENT horizontal equity clusters...")
        df_universe = _mark_horizontal_equity_clusters_per_model_group(
            df_universe,
            settings,
            verbose,
            settings_object="impr_equity",
            id_name="impr_he_id",
            output_folder="horizontal_equity/improvement",
            use_cache=use_cache,
        )

    # Write back unconditionally: this used to sit inside the improvement branch,
    # so the general/land results were dropped when do_impr_clusters was False.
    sup.set("universe", df_universe)
    return sup