Skip to content

API Reference

This page is automatically generated from the docstrings in our code.

pyecoacc.util.analytics

compare_models_cv(X, y, model_dict, cv=5, cv_method='stratified', individuals=None, random_state=42, round_digits=3)

Compute summary tables to compare models.

Parameters:

Name Type Description Default
X array

feature matrix

required
y array

labels vector

required
model_dict dict

a dictionary of models to evaluate with format {"model_name": model}

required
cv int

number of cross-validation splits. Defaults to 5.

5
cv_method str

cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".

'stratified'
individuals array

individual identifiers for grouping. Defaults to None.

None
random_state int

random state for reproducibility. Defaults to 42.

42
round_digits int

number of decimal places to round results. Defaults to 3.

3

Returns:

Name Type Description
accuracy DataFrame

summary table of overall accuracy across CV splits

recall DataFrame

summary table of recall per model

precision DataFrame

summary table of precision per model

f1 DataFrame

summary table of F1-score per model

all_data dict

detailed results per model

Source code in pyecoacc/util/analytics.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def compare_models_cv(X, y, model_dict, cv=5, cv_method="stratified", individuals=None,
                      random_state=42, round_digits=3):
    """Compute summary tables to compare models.

    Runs cross-validated evaluation (via ``model_analytics_cv``) for every
    model in ``model_dict`` and collects per-split accuracies plus per-class
    recall/precision/F1 formatted as "mean (std)" strings.

    Args:
        X (np.array): feature matrix
        y (np.array): labels vector
        model_dict (dict): a dictionary of models to evaluate with format {"model_name": model}
        cv (int, optional): number of cross-validation splits. Defaults to 5.
        cv_method (str, optional): cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".
        individuals (np.array, optional): individual identifiers for grouping. Defaults to None.
        random_state (int, optional): random state for reproducibility. Defaults to 42.
        round_digits (int, optional): number of decimal places to round results. Defaults to 3.

    Returns:
        accuracy (DataFrame): summary table of overall accuracy across CV splits
        recall (DataFrame): summary table of recall per model
        precision (DataFrame): summary table of precision per model
        f1 (DataFrame): summary table of F1-score per model
        all_data (dict): detailed results per model
    """

    all_data = dict()
    accuracy = dict()

    for model_name, clf in model_dict.items():
        print(f"Starting model {model_name}...")

        model_accuracy, mean_report, std_report, splits = model_analytics_cv(X, y, clf,
                                                                             cv=cv,
                                                                             cv_method=cv_method,
                                                                             individuals=individuals,
                                                                             random_state=random_state)

        all_data[model_name] = {"mean_report": mean_report, "std_report": std_report, "splits": splits}
        accuracy[model_name] = model_accuracy

    # Overall accuracy per split, with summary rows appended.
    accuracy = pd.DataFrame(accuracy)
    # Compute BOTH statistics before appending any summary row. Appending the
    # "mean" row first and then calling .std() would (wrongly) include the
    # mean row as an extra observation in the standard deviation.
    split_mean = accuracy.mean()
    split_std = accuracy.std()
    accuracy.loc["mean"] = split_mean
    accuracy.loc["std"] = split_std

    # Recall, Precision, F1 per class, formatted as "mean (std)" strings.
    mean_std_reports = {name: info["mean_report"].round(round_digits).astype(str) + " (" + info["std_report"].round(round_digits).astype(str) + ")"
                        for name, info in all_data.items()}
    recall = pd.DataFrame({name: frame.loc["recall"] for name, frame in mean_std_reports.items()})
    precision = pd.DataFrame({name: frame.loc["precision"] for name, frame in mean_std_reports.items()})
    f1 = pd.DataFrame({name: frame.loc["f1-score"] for name, frame in mean_std_reports.items()})

    return accuracy, recall, precision, f1, all_data

compute_confusion_matrix(y_true, y_pred, normalize='true', round=2)

Compute confusion matrix.

Parameters:

Name Type Description Default
y_true array or list

Ground truth labels.

required
y_pred array or list

Predicted labels.

required
normalize str

When set to 'true' normalize rows of confusion matrix. Passed to sklearn.metrics.confusion_matrix. Defaults to 'true'.

'true'
round int

Number of decimal points to keep. Defaults to 2.

2

Returns:

Name Type Description
confusion_matrix DataFrame

Confusion matrix as a pandas DataFrame.

Source code in pyecoacc/util/analytics.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def compute_confusion_matrix(y_true, y_pred, normalize='true', round=2):
    """Compute a labelled confusion matrix.

    Args:
        y_true (np.array or list): Ground truth labels.
        y_pred (np.array or list): Predicted labels.
        normalize (str, optional): When set to 'true' normalize rows of confusion matrix. Passed to sklearn.metrics.confusion_matrix. Defaults to 'true'.
        round (int, optional): Number of decimal points to keep. Defaults to 2.

    Returns:
        confusion_matrix (pd.DataFrame): Confusion matrix as a pandas DataFrame.
    """
    # Class labels are taken from the ground truth only; they name both the
    # rows (true classes) and columns (predicted classes) of the output.
    class_labels = list(np.unique(y_true))
    matrix = confusion_matrix(y_true, y_pred, labels=class_labels, normalize=normalize)
    result = pd.DataFrame(matrix, index=class_labels, columns=class_labels)
    return result.round(round)

model_analytics_cv(X, y, model, cv=5, cv_method='stratified', individuals=None, random_state=42)

Computes model analytics table.

Parameters:

Name Type Description Default
X array

feature matrix

required
y array

labels vector

required
model Pipeline or Estimator

model to evaluate

required
cv int

number of cross-validation splits. Defaults to 5.

5
cv_method str

cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".

'stratified'
individuals array

individual identifiers for grouping. Defaults to None.

None
random_state int

random state for reproducibility. Defaults to 42.

42

Returns:

Name Type Description
overall_accuracy dict

overall accuracy per CV split

mean_report DataFrame

mean classification report across CV splits

std_report DataFrame

standard deviation of classification report across CV splits

splits_output dict

detailed classification reports per CV split

Source code in pyecoacc/util/analytics.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def model_analytics_cv(X, y, model, cv=5, cv_method="stratified", individuals=None,
                       random_state=42):
    """Computes model analytics table.

    For each cross-validation split, fits the model on the training fold and
    computes the overall accuracy and a per-class classification report on
    the test fold, then aggregates the reports across splits.

    Args:
        X (np.array): feature matrix
        y (np.array): labels vector
        model (Pipeline or Estimator): model to evaluate
        cv (int, optional): number of cross-validation splits. Defaults to 5.
        cv_method (str, optional): cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".
        individuals (np.array, optional): individual identifiers for grouping. Required for the "animal-groups" and "LOIO" methods. Defaults to None.
        random_state (int, optional): random state for reproducibility. Only used by the "stratified" method. Defaults to 42.

    Raises:
        ValueError: if cv_method is not supported, or a grouped method is
            requested without providing individuals.

    Returns:
        overall_accuracy (dict): overall accuracy per CV split, keyed by split name
        mean_report (DataFrame): mean classification report across CV splits
        std_report (DataFrame): standard deviation of classification report across CV splits
        splits_output (dict): detailed classification reports per CV split
    """
    splits_output = dict()
    overall_accuracy = dict()

    # Encode labels as integers so folds can be compared uniformly; the
    # encoder is inverted again before building the classification report.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Fail early with a clear message instead of letting sklearn raise a
    # less specific error about a missing `groups` argument.
    if cv_method in ("animal-groups", "LOIO") and individuals is None:
        raise ValueError(f"individuals must be provided for cv_method={cv_method!r}")

    if cv_method == "stratified":
        cross_val_splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
        split_indices = cross_val_splitter.split(X, y)

    elif cv_method == "animal-groups":
        cross_val_splitter = GroupKFold(n_splits=cv)
        split_indices = cross_val_splitter.split(X, y, individuals)

    elif cv_method == "LOIO":
        cross_val_splitter = LeaveOneGroupOut()
        split_indices = cross_val_splitter.split(X, y, individuals)

    else:
        raise ValueError(f"Unsupported cross-validation method: {cv_method}")

    for i, (train_index, test_index) in enumerate(split_indices):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_encoded[train_index], y_encoded[test_index]

        y_hat = model.fit(X_train, y_train).predict(X_test)

        # Name splits 1..cv for k-fold methods; for LOIO name the split after
        # the single held-out individual.
        group_name = None
        if cv_method in ("stratified", "animal-groups"):
            group_name = i + 1
        elif cv_method == "LOIO":
            group_name = np.unique(individuals[test_index])[0]

        overall_accuracy[f"split-{group_name}"] = (y_hat == y_test).mean()

        report = classification_report(le.inverse_transform(y_test), le.inverse_transform(y_hat),
                                       labels=le.classes_,
                                       output_dict=True,
                                       zero_division=0)
        report = pd.DataFrame(report)
        # Drop the scalar "accuracy" column so the frame stays per-class.
        report.drop("accuracy", axis=1, inplace=True)
        splits_output[f"split-{group_name}"] = report

    # Element-wise mean/std of the per-split report frames.
    mean_report = pd.concat(splits_output.values()).groupby(level=0).mean()
    std_report = pd.concat(splits_output.values()).groupby(level=0).std()

    return overall_accuracy, mean_report, std_report, splits_output

pyecoacc.util.time_budget

compute_time_budget(raw_acc, clf, cm=None, apply_cm_correction=True)

Use a trained classifier to compute a time budget.

Parameters:

Name Type Description Default
raw_acc array

the raw accelerometer data to compute the time budget from

required
clf Pipeline

the trained classifier

required
cm DataFrame

the confusion matrix used for correction. Defaults to None.

None
apply_cm_correction bool

If False, no correction is applied. Defaults to True.

True

Returns:

Name Type Description
budget Series

the time budget

Source code in pyecoacc/util/time_budget.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def compute_time_budget(raw_acc, clf, cm=None, apply_cm_correction=True):
    """Use a trained classifier to compute a time budget.

    Predicts a behavior for every sample of raw accelerometer data and
    returns the proportion of time spent in each behavior, optionally
    corrected with a confusion matrix.

    Args:
        raw_acc (np.array): the raw accelerometer data to compute the time budget from
        clf (Pipeline): the trained classifier
        cm (pd.DataFrame, optional): the confusion matrix used for correction. Defaults to None.
        apply_cm_correction (bool, optional): If False, no correction is applied. Defaults to True.

    Raises:
        ValueError: if apply_cm_correction is True but no confusion matrix is given.

    Returns:
        budget (pd.Series): the time budget
    """

    if apply_cm_correction and cm is None:
        raise ValueError("Confusion matrix must be provided if apply_cm_correction=True")

    y_hat = clf.predict(raw_acc)
    tb = pd.Series(y_hat).value_counts(normalize=True)

    if apply_cm_correction:
        # Align the budget to the confusion matrix classes. Classes that were
        # never predicted get a proportion of 0.0; plain label indexing
        # (tb[cm.index]) would raise a KeyError / produce NaN for them.
        tb = confusion_matrix_correction(tb.reindex(cm.index, fill_value=0.0), cm)

    return tb

confusion_matrix_correction(budget, cm)

Apply the confusion matrix correction for time budgets.

Parameters:

Name Type Description Default
budget Series

the raw time budget

required
cm DataFrame

the confusion matrix used for correction

required

Returns:

Name Type Description
corrected_budget Series

the corrected time budget

Source code in pyecoacc/util/time_budget.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
def confusion_matrix_correction(budget, cm):
    """Apply the confusion matrix correction for time budgets.

    Solves ``cm.T @ corrected = budget`` for ``corrected``, i.e. undoes the
    systematic misclassification encoded by the (row-normalized) confusion
    matrix. Equivalent to ``inv(cm).T @ budget``.

    Args:
        budget (pd.Series): the raw time budget
        cm (pd.DataFrame): the confusion matrix used for correction

    Raises:
        numpy.linalg.LinAlgError: if the confusion matrix is singular.

    Returns:
        corrected_budget (pd.Series): the corrected time budget
    """
    # np.linalg.solve is numerically more stable (and cheaper) than forming
    # the explicit inverse with np.linalg.inv and multiplying.
    corrected = np.linalg.solve(np.asarray(cm).T, np.asarray(budget))
    return pd.Series(corrected, index=budget.index)

options: show_submodules: false members: - confusion_matrix_correction - compute_time_budget filters: ["!^"]

pyecoacc.util.training

train_compute_cm(model, X, y, cm_estimation_percent=0.2, round=2)

Train a model on part of the data and estimate its confusion matrix on the held-out remainder.

Parameters:

Name Type Description Default
model Pipeline or Estimator

the model to train and evaluate

required
X array

features to train and evaluate on

required
y array

labels to train and evaluate on

required
cm_estimation_percent float

the fraction of data used for confusion matrix estimation. Defaults to .2.

0.2
round int

the number of decimal places to round the confusion matrix. Defaults to 2.

2

Returns:

Name Type Description
confusion_matrix DataFrame

the estimated confusion matrix

Source code in pyecoacc/util/training.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def train_compute_cm(model, X, y, cm_estimation_percent=.2, round=2, random_state=None):
    """Train a model on part of the data and estimate its confusion matrix on the rest.

    Args:
        model (Pipeline or Estimator): the model to train and evaluate
        X (np.array): features to train and evaluate on
        y (np.array): labels to train and evaluate on
        cm_estimation_percent (float, optional): the fraction of data used for confusion matrix estimation. Defaults to .2.
        round (int, optional): the number of decimal places to round the confusion matrix. Defaults to 2.
        random_state (int, optional): random state for a reproducible train/test split. Defaults to None (non-deterministic split, the previous behavior).

    Returns:
        confusion_matrix (pd.DataFrame): the estimated confusion matrix
    """
    # hold out a fraction of the data for confusion matrix estimation
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=cm_estimation_percent,
                                                        random_state=random_state)

    # train on the remainder
    model.fit(X_train, y_train)

    # estimate the confusion matrix on the held-out data
    y_hat = model.predict(X_test)
    cm = compute_confusion_matrix(y_test, y_hat, round=round)

    return cm

options: show_submodules: false members: - train_compute_cm filters: ["!^"]

pyecoacc.util.preprocessing

long_to_wide_multi_animal(df, id_col='AnimalID', segment_duration='1s', xcol='accX', ycol='accY', zcol='accZ', timestamp_col='Timestamp', sort_by_time=True)

Make a wide dataframe: one row per non-overlapping segment. Appropriate to use for multiple animals with continuous data for each one.

Assumes (approximately) constant sampling interval; drops the last partial segment. Assumes no gaps in the data of each animal.

Parameters:

Name Type Description Default
df DataFrame

input dataframe

required
segment_duration str

encodes the duration of each segment. Defaults to "1s".

'1s'
xcol str

the name of the column for the X acceleration axis. Defaults to "accX".

'accX'
ycol str

the name of the column for the Y acceleration axis. Defaults to "accY".

'accY'
zcol str

the name of the column for the Z acceleration axis. Defaults to "accZ".

'accZ'
timestamp_col str

the name of the column for the timestamp. Defaults to "Timestamp".

'Timestamp'
sort_by_time bool

if True, sort the data by timestamp. Defaults to True.

True
id_col str

the name of the column identifying the individual animal. Defaults to "AnimalID".

'AnimalID'

Returns:

Name Type Description
segment_table DataFrame

a wide dataframe with one row per non-overlapping segment.

Source code in pyecoacc/util/preprocessing.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def long_to_wide_multi_animal(
    df: pd.DataFrame, 
    id_col: str = "AnimalID",
    segment_duration: str = "1s",
    xcol: str = "accX",
    ycol: str = "accY",
    zcol: str = "accZ",
    timestamp_col: str = "Timestamp",
    sort_by_time: bool = True,
) -> pd.DataFrame:
    """Make a wide dataframe: one row per non-overlapping segment. Appropriate to use for multiple animals with continuous data for each one.


    Assumes (approximately) constant sampling interval; drops the last partial segment.
    Assumes no gaps in the data of each animal.

    Args:
        df (pd.DataFrame): input dataframe
        segment_duration (str, optional): encodes the duration of each segment. Defaults to "1s".
        xcol (str, optional): the name of the column for the X acceleration axis. Defaults to "accX".
        ycol (str, optional): the name of the column for the Y acceleration axis. Defaults to "accY".
        zcol (str, optional): the name of the column for the Z acceleration axis. Defaults to "accZ".
        timestamp_col (str, optional): the name of the column for the timestamp. Defaults to "Timestamp".
        sort_by_time (bool, optional): if True, sort the data by timestamp. Defaults to True.
        id_col (str, optional): the name of the column identifying the individual animal. Defaults to "AnimalID".

    Returns:
        segment_table (pd.DataFrame): a wide dataframe with one row per non-overlapping segment,
            indexed by (animal id, segment start timestamp).
    """

    all_animals = []

    for animal_id, animal in df.groupby(id_col):
        segments = long_to_wide_segments(animal, 
                                         segment_duration=segment_duration,
                                         xcol=xcol,
                                         ycol=ycol,
                                         zcol=zcol,
                                         timestamp_col=timestamp_col,
                                         sort_by_time=sort_by_time)
        # Wrap the scalar id in a list: from_product expects iterables, so a
        # bare string id would be iterated character-by-character (and a
        # non-iterable id such as an int would raise TypeError).
        segments.index = pd.MultiIndex.from_product([[animal_id], segments.index])
        all_animals.append(segments)

    out = pd.concat(all_animals, axis=0)
    return out

long_to_wide_segments(df, segment_duration='1s', xcol='accX', ycol='accY', zcol='accZ', timestamp_col='Timestamp', sort_by_time=True)

Make a wide dataframe: one row per non-overlapping segment. Appropriate to use for a single animal with continuous data.

The input format uses the original long-shape columns: X, Y, Z, timestamp. Assumes (approximately) constant sampling interval; drops the last partial segment. Assumes no gaps.

We thank an anonymous reviewer for contributing this function.

Parameters:

Name Type Description Default
df DataFrame

input dataframe

required
segment_duration str

encodes the duration of each segment. Defaults to "1s".

'1s'
xcol str

the name of the column for the X acceleration axis. Defaults to "accX".

'accX'
ycol str

the name of the column for the Y acceleration axis. Defaults to "accY".

'accY'
zcol str

the name of the column for the Z acceleration axis. Defaults to "accZ".

'accZ'
timestamp_col str

the name of the column for the timestamp. Defaults to "Timestamp".

'Timestamp'
sort_by_time bool

if True, sort the data by timestamp. Defaults to True.

True

Returns:

Name Type Description
segment_table DataFrame

a wide dataframe with one row per non-overlapping segment.

Source code in pyecoacc/util/preprocessing.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def long_to_wide_segments(
    df: pd.DataFrame, 
    segment_duration: str = "1s",
    xcol: str = "accX",
    ycol: str = "accY",
    zcol: str = "accZ",
    timestamp_col: str = "Timestamp",
    sort_by_time: bool = True,
) -> pd.DataFrame:
    """Reshape long-format accelerometer data into one row per non-overlapping segment.

    Appropriate for a single animal with continuous data. The input uses the
    original long-shape columns: X, Y, Z, timestamp. Assumes an
    (approximately) constant sampling interval inferred from the first two
    timestamps, assumes no gaps, and drops the last partial segment.

    We thank an anonymous reviewer for contributing this function.

    Args:
        df (pd.DataFrame): input dataframe
        segment_duration (str, optional): encodes the duration of each segment. Defaults to "1s".
        xcol (str, optional): the name of the column for the X acceleration axis. Defaults to "accX".
        ycol (str, optional): the name of the column for the Y acceleration axis. Defaults to "accY".
        zcol (str, optional): the name of the column for the Z acceleration axis. Defaults to "accZ".
        timestamp_col (str, optional): the name of the column for the timestamp. Defaults to "Timestamp".
        sort_by_time (bool, optional): if True, sort the data by timestamp. Defaults to True.

    Raises:
        ValueError: if fewer than 2 valid rows remain, if the inferred
            sampling interval is non-positive, or if segment_duration is
            shorter than the sampling interval.

    Returns:
        segment_table (pd.DataFrame): a wide dataframe with one row per non-overlapping
            segment, indexed by the timestamp of each segment's first sample.
    """

    data = df[[timestamp_col, xcol, ycol, zcol]].copy()
    # Coerce unparseable timestamps to NaT, then drop those rows.
    data[timestamp_col] = pd.to_datetime(data[timestamp_col], errors="coerce")
    data = data.dropna(subset=[timestamp_col])

    if sort_by_time:
        # mergesort keeps the order of equal timestamps stable
        data = data.sort_values(timestamp_col, kind="mergesort").reset_index(drop=True)

    if len(data) < 2:
        raise ValueError("Need at least 2 rows to infer sampling interval")

    times = data[timestamp_col]
    # Sampling interval inferred from the gap between the first two samples.
    sample_interval = (times.iloc[1] - times.iloc[0]).total_seconds()
    if sample_interval <= 0:
        raise ValueError("Non-positive sampling interval (check timestamp ordering/duplicates)")

    segment_seconds = pd.to_timedelta(segment_duration).total_seconds()
    samples_per_segment = int(round(segment_seconds / sample_interval))
    if samples_per_segment <= 0:
        raise ValueError("segment_duration too small for inferred sampling interval")

    values = data[[xcol, ycol, zcol]].to_numpy()  # (N, 3)

    # Drop the trailing partial segment, then fold each segment's samples
    # into a single row of interleaved x,y,z triplets: (nseg, 3*T).
    n_segments = values.shape[0] // samples_per_segment
    wide = values[: n_segments * samples_per_segment].reshape(n_segments, samples_per_segment * 3)

    # column names: x,y,z,x.1,y.1,z.1,...
    column_names = []
    for sample_idx in range(samples_per_segment):
        for axis in ("x", "y", "z"):
            column_names.append(axis if sample_idx == 0 else f"{axis}.{sample_idx}")

    # Each segment is labelled with the timestamp of its first sample.
    segment_starts = times.iloc[::samples_per_segment].iloc[:n_segments].to_numpy()
    return pd.DataFrame(wide, columns=column_names, index=segment_starts)