Skip to content

openavmkit.filters

resolve_bool_filter

resolve_bool_filter(df, f)

Resolve a list of filters using a boolean operator.

Iterates through each filter in the list (after the operator) and combines their boolean indices using the specified boolean operator ("and", "or", "nand", "nor", "xor", "xnor").

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
f list

List where the first element is the boolean operator and the remaining elements are filter objects.

required

Returns:

Type Description
Series

Boolean Series resulting from applying the boolean operator.

Source code in openavmkit/filters.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def resolve_bool_filter(df: pd.DataFrame, f: list) -> pd.Series:
    """
    Resolve a list of filters using a boolean operator.

    Every filter after the operator is resolved with ``resolve_filter`` and the
    resulting boolean Series are folded together. The negated operators ("nand",
    "nor", "xnor") are evaluated as the negation of the *whole* fold, i.e.
    ``nand(a, b, c) == ~(a & b & c)``, so the result does not depend on how many
    operands are supplied or how they are grouped. For exactly two operands this
    is identical to the pairwise definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame.
    f : list
        List where the first element is the boolean operator ("and", "or", "nand",
        "nor", "xor", "xnor") and the remaining elements are filter objects.

    Returns
    -------
    pandas.Series
        Boolean Series resulting from applying the boolean operator. When no
        filter objects follow the operator, an all-False Series indexed like
        ``df`` is returned (the same result ``resolve_filter`` gives for an empty
        filter).

    Raises
    ------
    ValueError
        If the operator is not one of the supported boolean operators.
    """

    operator = f[0]
    operands = f[1:]

    # Map each operator onto its base combination; negation is applied once,
    # after all operands have been folded together.
    if operator in ("and", "nand"):
        mode = "and"
    elif operator in ("or", "nor"):
        mode = "or"
    elif operator in ("xor", "xnor"):
        mode = "xor"
    else:
        raise ValueError(f"Unknown boolean operator {operator}")
    negate = operator in ("nand", "nor", "xnor")

    if not operands:
        # Nothing to combine: select no rows rather than returning None.
        return pd.Series(False, index=df.index)

    combined = None
    for sub_filter in operands:
        mask = resolve_filter(df, sub_filter)

        if combined is None:
            combined = mask
        elif mode == "and":
            combined = combined & mask
        elif mode == "or":
            combined = combined | mask
        else:
            combined = combined ^ mask

    return ~combined if negate else combined

resolve_filter

resolve_filter(df, f, rename_map=None)

Resolve a filter list into a boolean Series for the DataFrame (which can be used for selection).

For basic operators, the filter list must contain an operator, a field, and an optional value. For boolean operators, the filter list must contain a boolean operator, followed by a list of filters.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
f list

Filter list.

required
rename_map dict

Optional mapping of original to renamed columns.

None

Returns:

Type Description
Series

Boolean Series corresponding to the filter.

Raises:

Type Description
ValueError

If the operator is unknown.

Source code in openavmkit/filters.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def resolve_filter(df: pd.DataFrame, f: list, rename_map: dict = None) -> pd.Series:
    """
    Resolve a filter list into a boolean Series for the DataFrame (which can be used for selection).

    For basic operators, the filter list must contain an operator, a field, and an
    optional value. For boolean operators, the filter list must contain a boolean
    operator, followed by a list of filters.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame.
    f : list
        Filter list. An empty list selects no rows.
    rename_map : dict, optional
        Optional mapping of original to renamed columns. It is only consulted for
        basic (field-level) filters; it is not forwarded to nested filters of
        "not" or boolean operators.

    Returns
    -------
    pandas.Series
        Boolean Series corresponding to the filter.

    Raises
    ------
    ValueError
        If the operator is unknown, or if the value given to "contains" /
        "contains_case_insensitive" is neither a string nor a list.
    """

    if len(f) == 0:
        return pd.Series(False, index=df.index)

    operator = f[0]

    # Composite filters are delegated to their dedicated resolvers.
    if operator == "not":
        return resolve_not_filter(df, f)
    if _is_bool_operator(operator):
        return resolve_bool_filter(df, f)

    field = f[1]
    # Handle field name resolution with rename_map
    if rename_map:
        # Create reverse map for looking up original names
        reverse_map = {v: k for k, v in rename_map.items()}
        if field in reverse_map and reverse_map[field] in df:
            field = reverse_map[field]
        elif field in rename_map and rename_map[field] in df:
            field = rename_map[field]

    value = f[2] if len(f) == 3 else None

    # A "str:" prefix marks a literal string value; strip the marker.
    if isinstance(value, str) and value.startswith("str:"):
        value = value[4:]

    # Ordered comparisons treat missing values as 0.
    if operator == ">":
        return df[field].fillna(0).gt(value)
    if operator == "<":
        return df[field].fillna(0).lt(value)
    if operator == ">=":
        return df[field].fillna(0).ge(value)
    if operator == "<=":
        return df[field].fillna(0).le(value)
    if operator == "==":
        return df[field].eq(value)
    if operator == "!=":
        return df[field].ne(value)
    if operator == "isin":
        return df[field].isin(value)
    if operator == "notin":
        return ~df[field].isin(value)
    if operator == "isempty":
        return pd.isna(df[field]) | df[field].astype(str).str.strip().eq("")
    if operator == "iszero":
        return df[field].eq(0)
    if operator == "iszeroempty":
        return (
            df[field].eq(0)
            | pd.isna(df[field])
            | df[field].astype(str).str.strip().eq("")
        )
    if operator in ("contains", "contains_case_insensitive"):
        if isinstance(value, str):
            patterns = [value]
        elif isinstance(value, list):
            patterns = value
        else:
            raise ValueError(
                f"Value must be a string or list for operator {operator}, found: {type(value)}"
            )
        case_sensitive = operator == "contains"
        # Start from "no match" so an empty pattern list selects nothing.
        selection = pd.Series(False, index=df.index)
        for pattern in patterns:
            # na=False: a missing value never matches. Without it the result
            # holds NaN and is not a usable boolean mask.
            selection = selection | df[field].str.contains(
                pattern, case=case_sensitive, na=False
            )
        return selection

    raise ValueError(f"Unknown operator {operator}")

resolve_not_filter

resolve_not_filter(df, f)

Resolve a NOT filter.

The first element of the filter list must be "not", followed by a filter list.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
f list

Filter list.

required

Returns:

Type Description
Series

Boolean Series resulting from applying the NOT operator.

Source code in openavmkit/filters.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def resolve_not_filter(df: pd.DataFrame, f: list) -> pd.Series:
    """
    Invert the boolean selection produced by a single nested filter.

    The filter list has the form ``["not", <filter>]``: the operator followed by
    exactly one filter list.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame.
    f : list
        Filter list whose first element is the operator and whose second element
        is the filter to negate.

    Returns
    -------
    pandas.Series
        Element-wise negation of the nested filter's boolean Series.

    Raises
    ------
    ValueError
        If the operator is followed by no filter, or by more than one.
    """
    operand_count = len(f) - 1

    if operand_count < 1:
        raise ValueError("NOT operator requires at least one argument")
    if operand_count > 1:
        raise ValueError("NOT operator only accepts one argument")

    inner_mask = resolve_filter(df, f[1])
    return ~inner_mask

select_filter

select_filter(df, f)

Select a subset of the DataFrame based on a list of filters.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required
f list

Filter expressed as a list.

required

Returns:

Type Description
DataFrame

Filtered DataFrame.

Source code in openavmkit/filters.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
def select_filter(df: pd.DataFrame, f: list) -> pd.DataFrame:
    """
    Return the rows of the DataFrame that satisfy a filter.

    The filter is first resolved to a boolean Series with ``resolve_filter`` and
    that Series is then used as a row mask.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame.
    f : list
        Filter expressed as a list.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` for which the resolved filter is True.
    """
    return df.loc[resolve_filter(df, f)]

validate_filter

validate_filter(f)

Validate a single filter list.

Checks that the filter's operator is appropriate for the value type.

Parameters:

Name Type Description Default
f list

Filter expressed as a list.

required

Returns:

Type Description
bool

True if the filter is valid.

Raises:

Type Description
ValueError

If the value type does not match the operator requirements.

Source code in openavmkit/filters.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def validate_filter(f: list):
    """
    Validate a single filter list.

    Checks that the filter's operator is appropriate for the value type. Filters
    headed by a boolean operator ("and", "or", "nand", "nor", "xor", "xnor") or
    by "not" carry nested filters rather than a value and are accepted as-is;
    their nested filters are not inspected here. Operators that take no value
    (for example "isempty") are accepted without one.

    Parameters
    ----------
    f : list
        Filter expressed as a list.

    Returns
    -------
    bool
        True if the filter is valid.

    Raises
    ------
    ValueError
        If the value type does not match the operator requirements (a missing
        value counts as a mismatch for operators that require one).
    """
    operator = f[0]

    # Composite filters have no scalar value in position 2 to check.
    if operator in ("and", "or", "nand", "nor", "xor", "xnor", "not"):
        return True

    # The value is optional in a filter list; treat an absent value as None
    # instead of failing with IndexError.
    value = f[2] if len(f) > 2 else None

    if operator in (">", "<", ">=", "<="):
        if not isinstance(value, (int, float, bool)):
            raise ValueError(f"Value must be a number for operator {operator}")
    elif operator in ("isin", "notin"):
        if not isinstance(value, list):
            raise ValueError(f"Value must be a list for operator {operator}")
    elif operator in ("contains", "contains_case_insensitive"):
        # resolve_filter accepts either one pattern or a list of patterns.
        if not isinstance(value, (str, list)):
            raise ValueError(
                f"Value must be a string or list for operator {operator}"
            )
    return True

validate_filter_list

validate_filter_list(filters)

Validate a list of filter lists.

Parameters:

Name Type Description Default
filters list[list]

List of filters (each filter is a list).

required

Returns:

Type Description
bool

True if all filters are valid.

Source code in openavmkit/filters.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
def validate_filter_list(filters: list[list]):
    """
    Run ``validate_filter`` over every filter in a list.

    Parameters
    ----------
    filters : list[list]
        List of filters (each filter is a list).

    Returns
    -------
    bool
        True once every filter has been passed to ``validate_filter`` without an
        exception being raised; any exception from ``validate_filter``
        propagates to the caller.
    """
    for single_filter in filters:
        # The return value is ignored; validation failures surface as exceptions.
        validate_filter(single_filter)

    return True