Skip to content

API Reference

This page is automatically generated from the docstrings in our code.

pyecoacc.util.analytics

compare_models_cv(X, y, model_dict, cv=5, cv_method='stratified', individuals=None, random_state=42, round_digits=3)

Compute summary tables to compare models.

Parameters:

Name Type Description Default
X array

feature matrix

required
y array

labels vector

required
model_dict dict

a dictionary of models to evaluate with format {"model_name": model}

required
cv int

number of cross-validation splits. Defaults to 5.

5
cv_method str

cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".

'stratified'
individuals array

individual identifiers for grouping. Defaults to None.

None
random_state int

random state for reproducibility. Defaults to 42.

42
round_digits int

number of decimal places to round results. Defaults to 3.

3

Returns:

Name Type Description
accuracy DataFrame

summary table of overall accuracy across CV splits

recall DataFrame

summary table of recall per model

precision DataFrame

summary table of precision per model

f1 DataFrame

summary table of F1-score per model

all_data dict

detailed results per model

Source code in pyecoacc/util/analytics.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def compare_models_cv(X, y, model_dict, cv=5, cv_method="stratified", individuals=None,
                      random_state=42, round_digits=3):
    """Compute summary tables to compare models.

    Runs cross-validated evaluation (via ``model_analytics_cv``) for every
    model in ``model_dict`` and collects per-split accuracies plus per-class
    recall/precision/F1 formatted as "mean (std)" strings.

    Args:
        X (np.array): feature matrix
        y (np.array): labels vector
        model_dict (dict): a dictionary of models to evaluate with format {"model_name": model}
        cv (int, optional): number of cross-validation splits. Defaults to 5.
        cv_method (str, optional): cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".
        individuals (np.array, optional): individual identifiers for grouping. Defaults to None.
        random_state (int, optional): random state for reproducibility. Defaults to 42.
        round_digits (int, optional): number of decimal places to round results. Defaults to 3.

    Returns:
        accuracy (DataFrame): summary table of overall accuracy across CV splits
        recall (DataFrame): summary table of recall per model
        precision (DataFrame): summary table of precision per model
        f1 (DataFrame): summary table of F1-score per model
        all_data (dict): detailed results per model
    """

    all_data = dict()
    accuracy = dict()

    for model_name, clf in model_dict.items():
        print(f"Starting model {model_name}...")

        model_accuracy, mean_report, std_report, splits = model_analytics_cv(X, y, clf,
                                                                             cv=cv,
                                                                             cv_method=cv_method,
                                                                             individuals=individuals,
                                                                             random_state=random_state)

        all_data[model_name] = {"mean_report": mean_report, "std_report": std_report, "splits": splits}
        accuracy[model_name] = model_accuracy

    # Overall accuracy per split, with summary rows appended.
    accuracy = pd.DataFrame(accuracy)
    # Compute BOTH statistics before appending any summary row. Appending the
    # "mean" row first and then calling .std() would (wrongly) include the
    # mean row as an extra observation in the standard deviation.
    split_mean = accuracy.mean()
    split_std = accuracy.std()
    accuracy.loc["mean"] = split_mean
    accuracy.loc["std"] = split_std

    # Recall, Precision, F1 per class, formatted as "mean (std)" strings.
    mean_std_reports = {name: info["mean_report"].round(round_digits).astype(str) + " (" + info["std_report"].round(round_digits).astype(str) + ")"
                        for name, info in all_data.items()}
    recall = pd.DataFrame({name: frame.loc["recall"] for name, frame in mean_std_reports.items()})
    precision = pd.DataFrame({name: frame.loc["precision"] for name, frame in mean_std_reports.items()})
    f1 = pd.DataFrame({name: frame.loc["f1-score"] for name, frame in mean_std_reports.items()})

    return accuracy, recall, precision, f1, all_data

compute_confusion_matrix(y_true, y_pred, normalize='true', round=2)

Compute confusion matrix.

Parameters:

Name Type Description Default
y_true array or list

Ground truth labels.

required
y_pred array or list

Predicted labels.

required
normalize str

When set to 'true' normalize rows of confusion matrix. Passed to sklearn.metrics.confusion_matrix. Defaults to 'true'.

'true'
round int

Number of decimal points to keep. Defaults to 2.

2

Returns:

Name Type Description
confusion_matrix DataFrame

Confusion matrix as a pandas DataFrame.

Source code in pyecoacc/util/analytics.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
def compute_confusion_matrix(y_true, y_pred, normalize='true', round=2):
    """Compute a labelled confusion matrix.

    Args:
        y_true (np.array or list): Ground truth labels.
        y_pred (np.array or list): Predicted labels.
        normalize (str, optional): When set to 'true' normalize rows of confusion matrix. Passed to sklearn.metrics.confusion_matrix. Defaults to 'true'.
        round (int, optional): Number of decimal points to keep. Defaults to 2.

    Returns:
        confusion_matrix (pd.DataFrame): Confusion matrix as a pandas DataFrame.
    """
    # Class labels are taken from the ground truth only; they name both the
    # rows (true classes) and columns (predicted classes) of the output.
    class_labels = list(np.unique(y_true))
    matrix = confusion_matrix(y_true, y_pred, labels=class_labels, normalize=normalize)
    result = pd.DataFrame(matrix, index=class_labels, columns=class_labels)
    return result.round(round)

model_analytics_cv(X, y, model, cv=5, cv_method='stratified', individuals=None, random_state=42)

Computes model analytics table.

Parameters:

Name Type Description Default
X array

feature matrix

required
y array

labels vector

required
model Pipeline or Estimator

model to evaluate

required
cv int

number of cross-validation splits. Defaults to 5.

5
cv_method str

cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".

'stratified'
individuals array

individual identifiers for grouping. Defaults to None.

None
random_state int

random state for reproducibility. Defaults to 42.

42

Returns:

Name Type Description
overall_accuracy dict

overall accuracy per CV split

mean_report DataFrame

mean classification report across CV splits

std_report DataFrame

standard deviation of classification report across CV splits

splits_output dict

detailed classification reports per CV split

Source code in pyecoacc/util/analytics.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def model_analytics_cv(X, y, model, cv=5, cv_method="stratified", individuals=None,
                       random_state=42):
    """Computes model analytics table.

    For each cross-validation split, fits the model on the training fold and
    computes the overall accuracy and a per-class classification report on
    the test fold, then aggregates the reports across splits.

    Args:
        X (np.array): feature matrix
        y (np.array): labels vector
        model (Pipeline or Estimator): model to evaluate
        cv (int, optional): number of cross-validation splits. Defaults to 5.
        cv_method (str, optional): cross-validation method. Options include stratified, animal-groups, and LOIO. Defaults to "stratified".
        individuals (np.array, optional): individual identifiers for grouping. Required for the "animal-groups" and "LOIO" methods. Defaults to None.
        random_state (int, optional): random state for reproducibility. Only used by the "stratified" method. Defaults to 42.

    Raises:
        ValueError: if cv_method is not supported, or a grouped method is
            requested without providing individuals.

    Returns:
        overall_accuracy (dict): overall accuracy per CV split, keyed by split name
        mean_report (DataFrame): mean classification report across CV splits
        std_report (DataFrame): standard deviation of classification report across CV splits
        splits_output (dict): detailed classification reports per CV split
    """
    splits_output = dict()
    overall_accuracy = dict()

    # Encode labels as integers so folds can be compared uniformly; the
    # encoder is inverted again before building the classification report.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Fail early with a clear message instead of letting sklearn raise a
    # less specific error about a missing `groups` argument.
    if cv_method in ("animal-groups", "LOIO") and individuals is None:
        raise ValueError(f"individuals must be provided for cv_method={cv_method!r}")

    if cv_method == "stratified":
        cross_val_splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
        split_indices = cross_val_splitter.split(X, y)

    elif cv_method == "animal-groups":
        cross_val_splitter = GroupKFold(n_splits=cv)
        split_indices = cross_val_splitter.split(X, y, individuals)

    elif cv_method == "LOIO":
        cross_val_splitter = LeaveOneGroupOut()
        split_indices = cross_val_splitter.split(X, y, individuals)

    else:
        raise ValueError(f"Unsupported cross-validation method: {cv_method}")

    for i, (train_index, test_index) in enumerate(split_indices):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_encoded[train_index], y_encoded[test_index]

        y_hat = model.fit(X_train, y_train).predict(X_test)

        # Name splits 1..cv for k-fold methods; for LOIO name the split after
        # the single held-out individual.
        group_name = None
        if cv_method in ("stratified", "animal-groups"):
            group_name = i + 1
        elif cv_method == "LOIO":
            group_name = np.unique(individuals[test_index])[0]

        overall_accuracy[f"split-{group_name}"] = (y_hat == y_test).mean()

        report = classification_report(le.inverse_transform(y_test), le.inverse_transform(y_hat),
                                       labels=le.classes_,
                                       output_dict=True,
                                       zero_division=0)
        report = pd.DataFrame(report)
        # Drop the scalar "accuracy" column so the frame stays per-class.
        report.drop("accuracy", axis=1, inplace=True)
        splits_output[f"split-{group_name}"] = report

    # Element-wise mean/std of the per-split report frames.
    mean_report = pd.concat(splits_output.values()).groupby(level=0).mean()
    std_report = pd.concat(splits_output.values()).groupby(level=0).std()

    return overall_accuracy, mean_report, std_report, splits_output

pyecoacc.util.time_budget

compute_time_budget(raw_acc, clf, cm=None, apply_cm_correction=True)

Use a trained classifier to compute a time budget.

Parameters:

Name Type Description Default
raw_acc array

the raw accelerometer data to compute the time budget from

required
clf Pipeline

the trained classifier

required
cm DataFrame

the confusion matrix used for correction. Defaults to None.

None
apply_cm_correction bool

If False, no correction is applied. Defaults to True.

True

Returns:

Name Type Description
budget Series

the time budget

Source code in pyecoacc/util/time_budget.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def compute_time_budget(raw_acc, clf, cm=None, apply_cm_correction=True):
    """Use a trained classifier to compute a time budget.

    Predicts a behavior for every sample of raw accelerometer data and
    returns the proportion of time spent in each behavior, optionally
    corrected with a confusion matrix.

    Args:
        raw_acc (np.array): the raw accelerometer data to compute the time budget from
        clf (Pipeline): the trained classifier
        cm (pd.DataFrame, optional): the confusion matrix used for correction. Defaults to None.
        apply_cm_correction (bool, optional): If False, no correction is applied. Defaults to True.

    Raises:
        ValueError: if apply_cm_correction is True but no confusion matrix is given.

    Returns:
        budget (pd.Series): the time budget
    """

    if apply_cm_correction and cm is None:
        raise ValueError("Confusion matrix must be provided if apply_cm_correction=True")

    y_hat = clf.predict(raw_acc)
    tb = pd.Series(y_hat).value_counts(normalize=True)

    if apply_cm_correction:
        # Align the budget to the confusion matrix classes. Classes that were
        # never predicted get a proportion of 0.0; plain label indexing
        # (tb[cm.index]) would raise a KeyError / produce NaN for them.
        tb = confusion_matrix_correction(tb.reindex(cm.index, fill_value=0.0), cm)

    return tb

confusion_matrix_correction(budget, cm)

Apply the confusion matrix correction for time budgets.

Parameters:

Name Type Description Default
budget Series

the raw time budget

required
cm DataFrame

the confusion matrix used for correction

required

Returns:

Name Type Description
corrected_budget Series

the corrected time budget

Source code in pyecoacc/util/time_budget.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
def confusion_matrix_correction(budget, cm):
    """Apply the confusion matrix correction for time budgets.

    Solves ``cm.T @ corrected = budget`` for ``corrected``, i.e. undoes the
    systematic misclassification encoded by the (row-normalized) confusion
    matrix. Equivalent to ``inv(cm).T @ budget``.

    Args:
        budget (pd.Series): the raw time budget
        cm (pd.DataFrame): the confusion matrix used for correction

    Raises:
        numpy.linalg.LinAlgError: if the confusion matrix is singular.

    Returns:
        corrected_budget (pd.Series): the corrected time budget
    """
    # np.linalg.solve is numerically more stable (and cheaper) than forming
    # the explicit inverse with np.linalg.inv and multiplying.
    corrected = np.linalg.solve(np.asarray(cm).T, np.asarray(budget))
    return pd.Series(corrected, index=budget.index)

options: show_submodules: false members: - confusion_matrix_correction - compute_time_budget filters: ["!^"]

pyecoacc.util.training

train_compute_cm(model, X, y, cm_estimation_percent=0.2, round=2)

Train a model on part of the data and estimate its confusion matrix on the held-out remainder.

Parameters:

Name Type Description Default
model Pipeline or Estimator

the model to train and evaluate

required
X array

features to train and evaluate on

required
y array

labels to train and evaluate on

required
cm_estimation_percent float

the fraction of data used for confusion matrix estimation. Defaults to .2.

0.2
round int

the number of decimal places to round the confusion matrix. Defaults to 2.

2

Returns:

Name Type Description
confusion_matrix DataFrame

the estimated confusion matrix

Source code in pyecoacc/util/training.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def train_compute_cm(model, X, y, cm_estimation_percent=.2, round=2, random_state=None):
    """Train a model on part of the data and estimate its confusion matrix on the rest.

    Args:
        model (Pipeline or Estimator): the model to train and evaluate
        X (np.array): features to train and evaluate on
        y (np.array): labels to train and evaluate on
        cm_estimation_percent (float, optional): the fraction of data used for confusion matrix estimation. Defaults to .2.
        round (int, optional): the number of decimal places to round the confusion matrix. Defaults to 2.
        random_state (int, optional): random state for a reproducible train/test split. Defaults to None (non-deterministic split, the previous behavior).

    Returns:
        confusion_matrix (pd.DataFrame): the estimated confusion matrix
    """
    # hold out a fraction of the data for confusion matrix estimation
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=cm_estimation_percent,
                                                        random_state=random_state)

    # train on the remainder
    model.fit(X_train, y_train)

    # estimate the confusion matrix on the held-out data
    y_hat = model.predict(X_test)
    cm = compute_confusion_matrix(y_test, y_hat, round=round)

    return cm

options: show_submodules: false members: - train_compute_cm filters: ["!^"]

pyecoacc.util.preprocessing

long_to_wide_multi_animal(df, id_col='AnimalID', segment_duration='1s', xcol='accX', ycol='accY', zcol='accZ', timestamp_col='Timestamp', sort_by_time=True)

Make a wide dataframe: one row per non-overlapping segment. Appropriate to use for multiple animals with continuous data for each one.

Assumes (approximately) constant sampling interval; drops the last partial segment. Assumes no gaps in the data of each animal.

Parameters:

Name Type Description Default
df DataFrame

input dataframe

required
segment_duration str

encodes the duration of each segment. Defaults to "1s".

'1s'
xcol str

the name of the column for the X acceleration axis. Defaults to "accX".

'accX'
ycol str

the name of the column for the Y acceleration axis. Defaults to "accY".

'accY'
zcol str

the name of the column for the Z acceleration axis. Defaults to "accZ".

'accZ'
timestamp_col str

the name of the column for the timestamp. Defaults to "Timestamp".

'Timestamp'
sort_by_time bool

if True, sort the data by timestamp. Defaults to True.

True
id_col str

the name of the column identifying the individual animal. Defaults to "AnimalID".

'AnimalID'

Returns:

Name Type Description
segment_table DataFrame

a wide dataframe with one row per non-overlapping segment.

Source code in pyecoacc/util/preprocessing.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def long_to_wide_multi_animal(
    df: pd.DataFrame, 
    id_col: str = "AnimalID",
    segment_duration: str = "1s",
    xcol: str = "accX",
    ycol: str = "accY",
    zcol: str = "accZ",
    timestamp_col: str = "Timestamp",
    sort_by_time: bool = True,
) -> pd.DataFrame:
    """Make a wide dataframe: one row per non-overlapping segment. Appropriate to use for multiple animals with continuous data for each one.


    Assumes (approximately) constant sampling interval; drops the last partial segment.
    Assumes no gaps in the data of each animal.

    Args:
        df (pd.DataFrame): input dataframe
        segment_duration (str, optional): encodes the duration of each segment. Defaults to "1s".
        xcol (str, optional): the name of the column for the X acceleration axis. Defaults to "accX".
        ycol (str, optional): the name of the column for the Y acceleration axis. Defaults to "accY".
        zcol (str, optional): the name of the column for the Z acceleration axis. Defaults to "accZ".
        timestamp_col (str, optional): the name of the column for the timestamp. Defaults to "Timestamp".
        sort_by_time (bool, optional): if True, sort the data by timestamp. Defaults to True.
        id_col (str, optional): the name of the column identifying the individual animal. Defaults to "AnimalID".

    Returns:
        segment_table (pd.DataFrame): a wide dataframe with one row per non-overlapping segment,
            indexed by (animal id, segment start timestamp).
    """

    all_animals = []

    for animal_id, animal in df.groupby(id_col):
        segments = long_to_wide_segments(animal, 
                                         segment_duration=segment_duration,
                                         xcol=xcol,
                                         ycol=ycol,
                                         zcol=zcol,
                                         timestamp_col=timestamp_col,
                                         sort_by_time=sort_by_time)
        # Wrap the scalar id in a list: from_product expects iterables, so a
        # bare string id would be iterated character-by-character (and a
        # non-iterable id such as an int would raise TypeError).
        segments.index = pd.MultiIndex.from_product([[animal_id], segments.index])
        all_animals.append(segments)

    out = pd.concat(all_animals, axis=0)
    return out

long_to_wide_segments(df, segment_duration='1s', xcol='accX', ycol='accY', zcol='accZ', timestamp_col='Timestamp', sort_by_time=True)

Make a wide dataframe: one row per non-overlapping segment. Appropriate to use for a single animal with continuous data.

The input format uses the original long-shape columns: X, Y, Z, timestamp. Assumes (approximately) constant sampling interval; drops the last partial segment. Assumes no gaps.

We thank an anonymous reviewer for contributing this function.

Parameters:

Name Type Description Default
df DataFrame

input dataframe

required
segment_duration str

encodes the duration of each segment. Defaults to "1s".

'1s'
xcol str

the name of the column for the X acceleration axis. Defaults to "accX".

'accX'
ycol str

the name of the column for the Y acceleration axis. Defaults to "accY".

'accY'
zcol str

the name of the column for the Z acceleration axis. Defaults to "accZ".

'accZ'
timestamp_col str

the name of the column for the timestamp. Defaults to "Timestamp".

'Timestamp'
sort_by_time bool

if True, sort the data by timestamp. Defaults to True.

True

Returns:

Name Type Description
segment_table DataFrame

a wide dataframe with one row per non-overlapping segment.

Source code in pyecoacc/util/preprocessing.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def long_to_wide_segments(
    df: pd.DataFrame, 
    segment_duration: str = "1s",
    xcol: str = "accX",
    ycol: str = "accY",
    zcol: str = "accZ",
    timestamp_col: str = "Timestamp",
    sort_by_time: bool = True,
) -> pd.DataFrame:
    """Reshape long-format accelerometer data into one row per non-overlapping segment.

    Appropriate for a single animal with continuous data. The input uses the
    original long-shape columns: X, Y, Z, timestamp. Assumes an
    (approximately) constant sampling interval inferred from the first two
    timestamps, assumes no gaps, and drops the last partial segment.

    We thank an anonymous reviewer for contributing this function.

    Args:
        df (pd.DataFrame): input dataframe
        segment_duration (str, optional): encodes the duration of each segment. Defaults to "1s".
        xcol (str, optional): the name of the column for the X acceleration axis. Defaults to "accX".
        ycol (str, optional): the name of the column for the Y acceleration axis. Defaults to "accY".
        zcol (str, optional): the name of the column for the Z acceleration axis. Defaults to "accZ".
        timestamp_col (str, optional): the name of the column for the timestamp. Defaults to "Timestamp".
        sort_by_time (bool, optional): if True, sort the data by timestamp. Defaults to True.

    Raises:
        ValueError: if fewer than 2 valid rows remain, if the inferred
            sampling interval is non-positive, or if segment_duration is
            shorter than the sampling interval.

    Returns:
        segment_table (pd.DataFrame): a wide dataframe with one row per non-overlapping
            segment, indexed by the timestamp of each segment's first sample.
    """

    data = df[[timestamp_col, xcol, ycol, zcol]].copy()
    # Coerce unparseable timestamps to NaT, then drop those rows.
    data[timestamp_col] = pd.to_datetime(data[timestamp_col], errors="coerce")
    data = data.dropna(subset=[timestamp_col])

    if sort_by_time:
        # mergesort keeps the order of equal timestamps stable
        data = data.sort_values(timestamp_col, kind="mergesort").reset_index(drop=True)

    if len(data) < 2:
        raise ValueError("Need at least 2 rows to infer sampling interval")

    times = data[timestamp_col]
    # Sampling interval inferred from the gap between the first two samples.
    sample_interval = (times.iloc[1] - times.iloc[0]).total_seconds()
    if sample_interval <= 0:
        raise ValueError("Non-positive sampling interval (check timestamp ordering/duplicates)")

    segment_seconds = pd.to_timedelta(segment_duration).total_seconds()
    samples_per_segment = int(round(segment_seconds / sample_interval))
    if samples_per_segment <= 0:
        raise ValueError("segment_duration too small for inferred sampling interval")

    values = data[[xcol, ycol, zcol]].to_numpy()  # (N, 3)

    # Drop the trailing partial segment, then fold each segment's samples
    # into a single row of interleaved x,y,z triplets: (nseg, 3*T).
    n_segments = values.shape[0] // samples_per_segment
    wide = values[: n_segments * samples_per_segment].reshape(n_segments, samples_per_segment * 3)

    # column names: x,y,z,x.1,y.1,z.1,...
    column_names = []
    for sample_idx in range(samples_per_segment):
        for axis in ("x", "y", "z"):
            column_names.append(axis if sample_idx == 0 else f"{axis}.{sample_idx}")

    # Each segment is labelled with the timestamp of its first sample.
    segment_starts = times.iloc[::samples_per_segment].iloc[:n_segments].to_numpy()
    return pd.DataFrame(wide, columns=column_names, index=segment_starts)