Grevy's Zebra Time Budget Analysis: Linear Regression¶
Comparing Habitat and Herd Size Effects on Behavior
In [2]:
Copied!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
In [3]:
Copied!
# ============================================================================
# 1. DATA LOADING AND PREPROCESSING
# ============================================================================
# Load the dataset
df = pd.read_csv('data/grevystimebudgetscleaned.csv')
print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())
# Check data types and missing values
print(f"\nData Types:")
print(df.dtypes)
print(f"\nMissing Values:")
print(df.isnull().sum())
# Examine categorical variables
print(f"\nUnique Habitats: {df['Habitat'].unique()}")
print(f"Unique Herd Sizes: {df['Herd Size'].unique()}")
# ============================================================================
# 1. DATA LOADING AND PREPROCESSING
# ============================================================================
# Load the dataset
df = pd.read_csv('data/grevystimebudgetscleaned.csv')
print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())
# Check data types and missing values
print(f"\nData Types:")
print(df.dtypes)
print(f"\nMissing Values:")
print(df.isnull().sum())
# Examine categorical variables
print(f"\nUnique Habitats: {df['Habitat'].unique()}")
print(f"Unique Herd Sizes: {df['Herd Size'].unique()}")
Dataset Overview: Shape: (82, 9) Columns: ['Sesssion', 'mini_scene_id', 'Walk', 'Head Up', 'Graze', 'Other', 'Interval(sec)', 'Habitat', 'Herd Size'] First few rows: Sesssion mini_scene_id Walk Head Up Graze Other \ 0 1 11_01_23-DJI_0977_2 0.442609 0.417140 0.096197 0.044054 1 1 11_01_23-DJI_0977_3 0.513430 0.371257 0.066039 0.049273 2 1 11_01_23-DJI_0977_1 0.557903 0.358236 0.032945 0.050915 3 2 11_01_23-DJI_0488_9 0.016229 0.983771 0.000000 0.000000 4 2 11_01_23-DJI_0488_45 0.000000 0.745382 0.000000 0.254618 Interval(sec) Habitat Herd Size 0 193.700000 Closed Small 1 194.833333 Closed Small 2 200.333333 Closed Small 3 112.966667 Open Small 4 108.266667 Open Small Data Types: Sesssion int64 mini_scene_id object Walk float64 Head Up float64 Graze float64 Other float64 Interval(sec) float64 Habitat object Herd Size object dtype: object Missing Values: Sesssion 0 mini_scene_id 0 Walk 0 Head Up 0 Graze 0 Other 0 Interval(sec) 0 Habitat 0 Herd Size 0 dtype: int64 Unique Habitats: ['Closed' 'Open'] Unique Herd Sizes: ['Small' 'Large']
In [4]:
Copied!
# ============================================================================
# 2. EXPLORATORY DATA ANALYSIS
# ============================================================================
# Behavior variables (dependent variables)
behaviors = ['Walk', 'Head Up', 'Graze', 'Other']
# Summary statistics for behaviors by habitat and herd size
print("\n" + "="*60)
print("SUMMARY STATISTICS BY HABITAT AND HERD SIZE")
print("="*60)
for behavior in behaviors:
print(f"\n{behavior}:")
print("By Habitat:")
print(df.groupby('Habitat')[behavior].agg(['count', 'mean', 'std']).round(4))
print("\nBy Herd Size:")
print(df.groupby('Herd Size')[behavior].agg(['count', 'mean', 'std']).round(4))
# Create visualization of behavior patterns
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Grevy\'s Zebra Behavior Patterns by Habitat and Herd Size', fontsize=16, y=0.98)
for i, behavior in enumerate(behaviors):
row, col = i // 2, i % 2
# Create subplot with both habitat and herd size
ax = axes[row, col]
# Box plot showing distribution by habitat and herd size
df_plot = df.melt(id_vars=['Habitat', 'Herd Size'],
value_vars=[behavior],
var_name='Behavior',
value_name='Proportion')
sns.boxplot(data=df_plot, x='Habitat', y='Proportion', hue='Herd Size', ax=ax)
ax.set_title(f'{behavior} by Habitat and Herd Size')
ax.set_ylabel('Proportion of Time')
plt.tight_layout()
plt.show()
# ============================================================================
# 2. EXPLORATORY DATA ANALYSIS
# ============================================================================
# Behavior variables (dependent variables)
behaviors = ['Walk', 'Head Up', 'Graze', 'Other']
# Summary statistics for behaviors by habitat and herd size
print("\n" + "="*60)
print("SUMMARY STATISTICS BY HABITAT AND HERD SIZE")
print("="*60)
for behavior in behaviors:
print(f"\n{behavior}:")
print("By Habitat:")
print(df.groupby('Habitat')[behavior].agg(['count', 'mean', 'std']).round(4))
print("\nBy Herd Size:")
print(df.groupby('Herd Size')[behavior].agg(['count', 'mean', 'std']).round(4))
# Create visualization of behavior patterns
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Grevy\'s Zebra Behavior Patterns by Habitat and Herd Size', fontsize=16, y=0.98)
for i, behavior in enumerate(behaviors):
row, col = i // 2, i % 2
# Create subplot with both habitat and herd size
ax = axes[row, col]
# Box plot showing distribution by habitat and herd size
df_plot = df.melt(id_vars=['Habitat', 'Herd Size'],
value_vars=[behavior],
var_name='Behavior',
value_name='Proportion')
sns.boxplot(data=df_plot, x='Habitat', y='Proportion', hue='Herd Size', ax=ax)
ax.set_title(f'{behavior} by Habitat and Herd Size')
ax.set_ylabel('Proportion of Time')
plt.tight_layout()
plt.show()
============================================================ SUMMARY STATISTICS BY HABITAT AND HERD SIZE ============================================================ Walk: By Habitat: count mean std Habitat Closed 32 0.1000 0.1557 Open 50 0.2493 0.2701 By Herd Size: count mean std Herd Size Large 59 0.0992 0.1370 Small 23 0.4266 0.2936 Head Up: By Habitat: count mean std Habitat Closed 32 0.2086 0.2867 Open 50 0.2657 0.2907 By Herd Size: count mean std Herd Size Large 59 0.1539 0.2281 Small 23 0.4731 0.3050 Graze: By Habitat: count mean std Habitat Closed 32 0.6148 0.3936 Open 50 0.4085 0.3871 By Herd Size: count mean std Herd Size Large 59 0.6739 0.3153 Small 23 0.0149 0.0314 Other: By Habitat: count mean std Habitat Closed 32 0.0765 0.1507 Open 50 0.0765 0.1391 By Herd Size: count mean std Herd Size Large 59 0.0731 0.1540 Small 23 0.0854 0.1114
In [11]:
Copied!
# ============================================================================
# 3. LINEAR REGRESSION ANALYSIS
# ============================================================================
# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()
df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])
# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))
print("\n" + "="*60)
print("ENCODING MAPPINGS")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")
# Prepare predictors
X = df[['Habitat_encoded', 'HerdSize_encoded']]
feature_names = ['Habitat', 'Herd Size']
# Store results for comparison
regression_results = {}
print("\n" + "="*60)
print("LINEAR REGRESSION RESULTS")
print("="*60)
# save significance levels
significance_levels = []
for behavior in behaviors:
print(f"\n{'-'*40}")
print(f"BEHAVIOR: {behavior}")
print(f"{'-'*40}")
# Fit linear regression
y = df[behavior]
model = LinearRegression()
model.fit(X, y)
# Predictions and R²
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
# Store results
regression_results[behavior] = {
'coefficients': model.coef_,
'intercept': model.intercept_,
'r2': r2,
'model': model
}
print(f"R² Score: {r2:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
print("Coefficients:")
for i, (coef, feature) in enumerate(zip(model.coef_, feature_names)):
print(f" {feature}: {coef:.4f}")
# Statistical significance testing
# Calculate t-statistics and p-values
n = len(y)
k = len(model.coef_)
# Residual sum of squares
y_pred = model.predict(X)
residuals = y - y_pred
rss = np.sum(residuals**2)
# Mean squared error
mse = rss / (n - k - 1)
# Standard errors of coefficients
X_with_intercept = np.column_stack([np.ones(n), X])
try:
cov_matrix = mse * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
se_coefs = np.sqrt(np.diag(cov_matrix))[1:] # Exclude intercept
# t-statistics and p-values
t_stats = model.coef_ / se_coefs
p_values = 2 * (1 - stats.t.cdf(np.abs(t_stats), n - k - 1))
print("Statistical Significance:")
for i, (feature, t_stat, p_val) in enumerate(zip(feature_names, t_stats, p_values)):
significance = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
print(f" {feature}: t={t_stat:.3f}, p={p_val:.4f} {significance}")
# save t_stats and p_values for later use
significance_levels.append((behavior, feature_names, t_stats, p_values))
except np.linalg.LinAlgError:
print("Could not calculate statistical significance (singular matrix)")
# ============================================================================
# 3. LINEAR REGRESSION ANALYSIS
# ============================================================================
# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()
df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])
# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))
print("\n" + "="*60)
print("ENCODING MAPPINGS")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")
# Prepare predictors
X = df[['Habitat_encoded', 'HerdSize_encoded']]
feature_names = ['Habitat', 'Herd Size']
# Store results for comparison
regression_results = {}
print("\n" + "="*60)
print("LINEAR REGRESSION RESULTS")
print("="*60)
# save significance levels
significance_levels = []
for behavior in behaviors:
print(f"\n{'-'*40}")
print(f"BEHAVIOR: {behavior}")
print(f"{'-'*40}")
# Fit linear regression
y = df[behavior]
model = LinearRegression()
model.fit(X, y)
# Predictions and R²
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
# Store results
regression_results[behavior] = {
'coefficients': model.coef_,
'intercept': model.intercept_,
'r2': r2,
'model': model
}
print(f"R² Score: {r2:.4f}")
print(f"Intercept: {model.intercept_:.4f}")
print("Coefficients:")
for i, (coef, feature) in enumerate(zip(model.coef_, feature_names)):
print(f" {feature}: {coef:.4f}")
# Statistical significance testing
# Calculate t-statistics and p-values
n = len(y)
k = len(model.coef_)
# Residual sum of squares
y_pred = model.predict(X)
residuals = y - y_pred
rss = np.sum(residuals**2)
# Mean squared error
mse = rss / (n - k - 1)
# Standard errors of coefficients
X_with_intercept = np.column_stack([np.ones(n), X])
try:
cov_matrix = mse * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
se_coefs = np.sqrt(np.diag(cov_matrix))[1:] # Exclude intercept
# t-statistics and p-values
t_stats = model.coef_ / se_coefs
p_values = 2 * (1 - stats.t.cdf(np.abs(t_stats), n - k - 1))
print("Statistical Significance:")
for i, (feature, t_stat, p_val) in enumerate(zip(feature_names, t_stats, p_values)):
significance = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
print(f" {feature}: t={t_stat:.3f}, p={p_val:.4f} {significance}")
# save t_stats and p_values for later use
significance_levels.append((behavior, feature_names, t_stats, p_values))
except np.linalg.LinAlgError:
print("Could not calculate statistical significance (singular matrix)")
============================================================ ENCODING MAPPINGS ============================================================ Habitat encoding: {0: 'Closed', 1: 'Open'} Herd Size encoding: {0: 'Large', 1: 'Small'} ============================================================ LINEAR REGRESSION RESULTS ============================================================ ---------------------------------------- BEHAVIOR: Walk ---------------------------------------- R² Score: 0.4022 Intercept: 0.0521 Coefficients: Habitat: 0.0868 Herd Size: 0.3066 Statistical Significance: Habitat: t=1.971, p=0.0523 Herd Size: t=6.409, p=0.0000 *** ---------------------------------------- BEHAVIOR: Head Up ---------------------------------------- R² Score: 0.2500 Intercept: 0.1584 Coefficients: Habitat: -0.0084 Herd Size: 0.3213 Statistical Significance: Habitat: t=-0.143, p=0.8864 Herd Size: t=5.034, p=0.0000 *** ---------------------------------------- BEHAVIOR: Graze ---------------------------------------- R² Score: 0.5621 Intercept: 0.7150 Coefficients: Habitat: -0.0757 Herd Size: -0.6408 Statistical Significance: Habitat: t=-1.216, p=0.2275 Herd Size: t=-9.479, p=0.0000 *** ---------------------------------------- BEHAVIOR: Other ---------------------------------------- R² Score: 0.0016 Intercept: 0.0745 Coefficients: Habitat: -0.0027 Herd Size: 0.0129 Statistical Significance: Habitat: t=-0.080, p=0.9368 Herd Size: t=0.355, p=0.7232
In [12]:
Copied!
significance_levels
significance_levels
Out[12]:
[('Walk', ['Habitat', 'Herd Size'], array([1.9706704 , 6.40901791]), array([5.22644921e-02, 9.83405646e-09])), ('Head Up', ['Habitat', 'Herd Size'], array([-0.14331907, 5.03400118]), array([8.86403135e-01, 2.95772428e-06])), ('Graze', ['Habitat', 'Herd Size'], array([-1.21636233, -9.47907617]), array([2.27468742e-01, 1.13242749e-14])), ('Other', ['Habitat', 'Herd Size'], array([-0.07960377, 0.35540207]), array([0.9367537 , 0.72323558]))]
In [14]:
Copied!
# print the regression results, coefficients, and statistical significance in table for manuscript
print("\n" + "="*60)
print("REGRESSION SUMMARY TABLE")
print("="*60)
summary_table = []
for behavior, results in regression_results.items():
row = {
'Behavior': behavior,
'R²': results['r2'],
'Intercept': results['intercept']
}
for i, feature in enumerate(feature_names):
row[f'Coef_{feature}'] = results['coefficients'][i]
summary_table.append(row)
for sig in significance_levels:
if sig[0] == behavior:
for i, feature in enumerate(sig[1]):
row[f't_{feature}'] = sig[2][i]
row[f'p_{feature}'] = sig[3][i]
# add checkmark for significant or not
for feature in feature_names:
p_val = row.get(f'p_{feature}', 1)
row[f'Significant_{feature}'] = '✓' if p_val < 0.05 else ''
summary_df = pd.DataFrame(summary_table)
print(summary_df.round(4))
# Save summary table to CSV
summary_df.to_csv('regression_summary.csv', index=False)
print("\nRegression summary saved to 'regression_summary.csv'")
# ============================================================================
# print the regression results, coefficients, and statistical significance in table for manuscript
print("\n" + "="*60)
print("REGRESSION SUMMARY TABLE")
print("="*60)
summary_table = []
for behavior, results in regression_results.items():
row = {
'Behavior': behavior,
'R²': results['r2'],
'Intercept': results['intercept']
}
for i, feature in enumerate(feature_names):
row[f'Coef_{feature}'] = results['coefficients'][i]
summary_table.append(row)
for sig in significance_levels:
if sig[0] == behavior:
for i, feature in enumerate(sig[1]):
row[f't_{feature}'] = sig[2][i]
row[f'p_{feature}'] = sig[3][i]
# add checkmark for significant or not
for feature in feature_names:
p_val = row.get(f'p_{feature}', 1)
row[f'Significant_{feature}'] = '✓' if p_val < 0.05 else ''
summary_df = pd.DataFrame(summary_table)
print(summary_df.round(4))
# Save summary table to CSV
summary_df.to_csv('regression_summary.csv', index=False)
print("\nRegression summary saved to 'regression_summary.csv'")
# ============================================================================
============================================================ REGRESSION SUMMARY TABLE ============================================================ Behavior R² Intercept Coef_Habitat Coef_Herd Size t_Habitat \ 0 Walk 0.4022 0.0521 0.0868 0.3066 1.9707 1 Head Up 0.2500 0.1584 -0.0084 0.3213 -0.1433 2 Graze 0.5621 0.7150 -0.0757 -0.6408 -1.2164 3 Other 0.0016 0.0745 -0.0027 0.0129 -0.0796 p_Habitat t_Herd Size p_Herd Size Significant_Habitat \ 0 0.0523 6.4090 0.0000 1 0.8864 5.0340 0.0000 2 0.2275 -9.4791 0.0000 3 0.9368 0.3554 0.7232 Significant_Herd Size 0 ✓ 1 ✓ 2 ✓ 3 Regression summary saved to 'regression_summary.csv'
In [6]:
Copied!
# ============================================================================
# 4. COEFFICIENT COMPARISON VISUALIZATION
# ============================================================================
# Create coefficient comparison plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Linear Regression Coefficients: Habitat and Herd Size Effects on Behaviors',
fontsize=16, y=0.98)
# Prepare data for plotting
coef_data = []
for behavior in behaviors:
for i, feature in enumerate(feature_names):
coef_data.append({
'Behavior': behavior,
'Predictor': feature,
'Coefficient': regression_results[behavior]['coefficients'][i],
'R²': regression_results[behavior]['r2']
})
coef_df = pd.DataFrame(coef_data)
# Plot 1: Habitat coefficients
ax1 = axes[0, 0]
habitat_coefs = coef_df[coef_df['Predictor'] == 'Habitat']
bars1 = ax1.bar(habitat_coefs['Behavior'], habitat_coefs['Coefficient'],
color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax1.set_title('Habitat Effect on Behaviors\n(Closed=0, Open=1)')
ax1.set_ylabel('Coefficient Value')
ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax1.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars1, habitat_coefs['Coefficient']):
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')
# Plot 2: Herd Size coefficients
ax2 = axes[0, 1]
herdsize_coefs = coef_df[coef_df['Predictor'] == 'Herd Size']
bars2 = ax2.bar(herdsize_coefs['Behavior'], herdsize_coefs['Coefficient'],
color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax2.set_title('Herd Size Effect on Behaviors\n(Large=0, Small=1)')
ax2.set_ylabel('Coefficient Value')
ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax2.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars2, herdsize_coefs['Coefficient']):
height = bar.get_height()
ax2.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')
# Plot 3: R² values comparison
ax3 = axes[1, 0]
r2_values = [regression_results[behavior]['r2'] for behavior in behaviors]
bars3 = ax3.bar(behaviors, r2_values, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax3.set_title('Model Performance (R² values)')
ax3.set_ylabel('R² Score')
ax3.set_ylim(0, max(r2_values) * 1.1)
ax3.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars3, r2_values):
height = bar.get_height()
ax3.text(bar.get_x() + bar.get_width()/2., height + 0.005,
f'{value:.3f}', ha='center', va='bottom')
# Plot 4: Coefficient comparison heatmap
ax4 = axes[1, 1]
pivot_coefs = coef_df.pivot(index='Behavior', columns='Predictor', values='Coefficient')
sns.heatmap(pivot_coefs, annot=True, cmap='RdBu_r', center=0, ax=ax4,
fmt='.3f', cbar_kws={'label': 'Coefficient Value'})
ax4.set_title('Coefficient Heatmap')
plt.tight_layout()
plt.show()
# ============================================================================
# 4. COEFFICIENT COMPARISON VISUALIZATION
# ============================================================================
# Create coefficient comparison plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Linear Regression Coefficients: Habitat and Herd Size Effects on Behaviors',
fontsize=16, y=0.98)
# Prepare data for plotting
coef_data = []
for behavior in behaviors:
for i, feature in enumerate(feature_names):
coef_data.append({
'Behavior': behavior,
'Predictor': feature,
'Coefficient': regression_results[behavior]['coefficients'][i],
'R²': regression_results[behavior]['r2']
})
coef_df = pd.DataFrame(coef_data)
# Plot 1: Habitat coefficients
ax1 = axes[0, 0]
habitat_coefs = coef_df[coef_df['Predictor'] == 'Habitat']
bars1 = ax1.bar(habitat_coefs['Behavior'], habitat_coefs['Coefficient'],
color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax1.set_title('Habitat Effect on Behaviors\n(Closed=0, Open=1)')
ax1.set_ylabel('Coefficient Value')
ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax1.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars1, habitat_coefs['Coefficient']):
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')
# Plot 2: Herd Size coefficients
ax2 = axes[0, 1]
herdsize_coefs = coef_df[coef_df['Predictor'] == 'Herd Size']
bars2 = ax2.bar(herdsize_coefs['Behavior'], herdsize_coefs['Coefficient'],
color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax2.set_title('Herd Size Effect on Behaviors\n(Large=0, Small=1)')
ax2.set_ylabel('Coefficient Value')
ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax2.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars2, herdsize_coefs['Coefficient']):
height = bar.get_height()
ax2.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')
# Plot 3: R² values comparison
ax3 = axes[1, 0]
r2_values = [regression_results[behavior]['r2'] for behavior in behaviors]
bars3 = ax3.bar(behaviors, r2_values, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax3.set_title('Model Performance (R² values)')
ax3.set_ylabel('R² Score')
ax3.set_ylim(0, max(r2_values) * 1.1)
ax3.tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar, value in zip(bars3, r2_values):
height = bar.get_height()
ax3.text(bar.get_x() + bar.get_width()/2., height + 0.005,
f'{value:.3f}', ha='center', va='bottom')
# Plot 4: Coefficient comparison heatmap
ax4 = axes[1, 1]
pivot_coefs = coef_df.pivot(index='Behavior', columns='Predictor', values='Coefficient')
sns.heatmap(pivot_coefs, annot=True, cmap='RdBu_r', center=0, ax=ax4,
fmt='.3f', cbar_kws={'label': 'Coefficient Value'})
ax4.set_title('Coefficient Heatmap')
plt.tight_layout()
plt.show()
In [7]:
Copied!
# ============================================================================
# 5. DETAILED INTERPRETATION
# ============================================================================
print("\n" + "="*60)
print("INTERPRETATION OF RESULTS")
print("="*60)
print("\nCOEFFICIENT INTERPRETATION:")
print("-" * 30)
print("Habitat coefficients (Closed=0, Open=1):")
print(" Positive coefficient = behavior increases in Open habitat")
print(" Negative coefficient = behavior decreases in Open habitat")
print("\nHerd Size coefficients (Large=0, Small=1):")
print(" Positive coefficient = behavior increases in Small herds")
print(" Negative coefficient = behavior decreases in Small herds")
print("\nKEY FINDINGS:")
print("-" * 15)
for behavior in behaviors:
habitat_coef = regression_results[behavior]['coefficients'][0]
herdsize_coef = regression_results[behavior]['coefficients'][1]
r2 = regression_results[behavior]['r2']
print(f"\n{behavior}:")
print(f" Model explains {r2*100:.1f}% of variance")
# Habitat effect
if abs(habitat_coef) > 0.01: # Threshold for meaningful effect
direction = "higher" if habitat_coef > 0 else "lower"
print(f" Habitat: {direction} in Open vs Closed ({habitat_coef:+.3f})")
else:
print(f" Habitat: minimal effect ({habitat_coef:+.3f})")
# Herd size effect
if abs(herdsize_coef) > 0.01:
direction = "higher" if herdsize_coef > 0 else "lower"
print(f" Herd Size: {direction} in Small vs Large herds ({herdsize_coef:+.3f})")
else:
print(f" Herd Size: minimal effect ({herdsize_coef:+.3f})")
# ============================================================================
# 5. DETAILED INTERPRETATION
# ============================================================================
print("\n" + "="*60)
print("INTERPRETATION OF RESULTS")
print("="*60)
print("\nCOEFFICIENT INTERPRETATION:")
print("-" * 30)
print("Habitat coefficients (Closed=0, Open=1):")
print(" Positive coefficient = behavior increases in Open habitat")
print(" Negative coefficient = behavior decreases in Open habitat")
print("\nHerd Size coefficients (Large=0, Small=1):")
print(" Positive coefficient = behavior increases in Small herds")
print(" Negative coefficient = behavior decreases in Small herds")
print("\nKEY FINDINGS:")
print("-" * 15)
for behavior in behaviors:
habitat_coef = regression_results[behavior]['coefficients'][0]
herdsize_coef = regression_results[behavior]['coefficients'][1]
r2 = regression_results[behavior]['r2']
print(f"\n{behavior}:")
print(f" Model explains {r2*100:.1f}% of variance")
# Habitat effect
if abs(habitat_coef) > 0.01: # Threshold for meaningful effect
direction = "higher" if habitat_coef > 0 else "lower"
print(f" Habitat: {direction} in Open vs Closed ({habitat_coef:+.3f})")
else:
print(f" Habitat: minimal effect ({habitat_coef:+.3f})")
# Herd size effect
if abs(herdsize_coef) > 0.01:
direction = "higher" if herdsize_coef > 0 else "lower"
print(f" Herd Size: {direction} in Small vs Large herds ({herdsize_coef:+.3f})")
else:
print(f" Herd Size: minimal effect ({herdsize_coef:+.3f})")
============================================================ INTERPRETATION OF RESULTS ============================================================ COEFFICIENT INTERPRETATION: ------------------------------ Habitat coefficients (Closed=0, Open=1): Positive coefficient = behavior increases in Open habitat Negative coefficient = behavior decreases in Open habitat Herd Size coefficients (Large=0, Small=1): Positive coefficient = behavior increases in Small herds Negative coefficient = behavior decreases in Small herds KEY FINDINGS: --------------- Walk: Model explains 40.2% of variance Habitat: higher in Open vs Closed (+0.087) Herd Size: higher in Small vs Large herds (+0.307) Head Up: Model explains 25.0% of variance Habitat: minimal effect (-0.008) Herd Size: higher in Small vs Large herds (+0.321) Graze: Model explains 56.2% of variance Habitat: lower in Open vs Closed (-0.076) Herd Size: lower in Small vs Large herds (-0.641) Other: Model explains 0.2% of variance Habitat: minimal effect (-0.003) Herd Size: higher in Small vs Large herds (+0.013)
In [8]:
Copied!
# ============================================================================
# 6. RESIDUAL ANALYSIS
# ============================================================================
print("\n" + "="*60)
print("RESIDUAL ANALYSIS")
print("="*60)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Residual Analysis for Linear Regression Models', fontsize=16, y=0.98)
for i, behavior in enumerate(behaviors):
ax = axes[i//2, i%2]
# Get predictions and residuals
model = regression_results[behavior]['model']
y_true = df[behavior]
y_pred = model.predict(X)
residuals = y_true - y_pred
# Residuals vs Predicted plot
ax.scatter(y_pred, residuals, alpha=0.6)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title(f'{behavior} - Residuals vs Predicted')
# Add trend line
z = np.polyfit(y_pred, residuals, 1)
p = np.poly1d(z)
ax.plot(sorted(y_pred), p(sorted(y_pred)), "r--", alpha=0.3)
plt.tight_layout()
plt.show()
print("\nModel assumptions check:")
print("- Look for random scatter around y=0 in residual plots")
print("- Non-random patterns may indicate model inadequacy")
print("- Consider transformations if patterns are evident")
# ============================================================================
# 6. RESIDUAL ANALYSIS
# ============================================================================
print("\n" + "="*60)
print("RESIDUAL ANALYSIS")
print("="*60)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Residual Analysis for Linear Regression Models', fontsize=16, y=0.98)
for i, behavior in enumerate(behaviors):
ax = axes[i//2, i%2]
# Get predictions and residuals
model = regression_results[behavior]['model']
y_true = df[behavior]
y_pred = model.predict(X)
residuals = y_true - y_pred
# Residuals vs Predicted plot
ax.scatter(y_pred, residuals, alpha=0.6)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title(f'{behavior} - Residuals vs Predicted')
# Add trend line
z = np.polyfit(y_pred, residuals, 1)
p = np.poly1d(z)
ax.plot(sorted(y_pred), p(sorted(y_pred)), "r--", alpha=0.3)
plt.tight_layout()
plt.show()
print("\nModel assumptions check:")
print("- Look for random scatter around y=0 in residual plots")
print("- Non-random patterns may indicate model inadequacy")
print("- Consider transformations if patterns are evident")
============================================================ RESIDUAL ANALYSIS ============================================================
Model assumptions check: - Look for random scatter around y=0 in residual plots - Non-random patterns may indicate model inadequacy - Consider transformations if patterns are evident
In [9]:
Copied!
# ============================================================================
# 7. SUMMARY TABLE
# ============================================================================
print("\n" + "="*60)
print("SUMMARY TABLE")
print("="*60)
summary_df = pd.DataFrame({
'Behavior': behaviors,
'R²': [regression_results[b]['r2'] for b in behaviors],
'Habitat_Coef': [regression_results[b]['coefficients'][0] for b in behaviors],
'HerdSize_Coef': [regression_results[b]['coefficients'][1] for b in behaviors],
'Intercept': [regression_results[b]['intercept'] for b in behaviors]
})
print(summary_df.round(4).to_string(index=False))
# Save results to CSV
summary_df.to_csv('regression_results_summary.csv', index=False)
print(f"\nResults saved to 'regression_results_summary.csv'")
print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)
# ============================================================================
# 7. SUMMARY TABLE
# ============================================================================
print("\n" + "="*60)
print("SUMMARY TABLE")
print("="*60)
summary_df = pd.DataFrame({
'Behavior': behaviors,
'R²': [regression_results[b]['r2'] for b in behaviors],
'Habitat_Coef': [regression_results[b]['coefficients'][0] for b in behaviors],
'HerdSize_Coef': [regression_results[b]['coefficients'][1] for b in behaviors],
'Intercept': [regression_results[b]['intercept'] for b in behaviors]
})
print(summary_df.round(4).to_string(index=False))
# Save results to CSV
summary_df.to_csv('regression_results_summary.csv', index=False)
print(f"\nResults saved to 'regression_results_summary.csv'")
print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)
============================================================ SUMMARY TABLE ============================================================ Behavior R² Habitat_Coef HerdSize_Coef Intercept Walk 0.4022 0.0868 0.3066 0.0521 Head Up 0.2500 -0.0084 0.3213 0.1584 Graze 0.5621 -0.0757 -0.6408 0.7150 Other 0.0016 -0.0027 0.0129 0.0745 Results saved to 'regression_results_summary.csv' ============================================================ ANALYSIS COMPLETE ============================================================
Analysis of interactions between habitat and herd size¶
In [10]:
Copied!
# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()
df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])
# Create interaction term
df['Habitat_HerdSize_interaction'] = df['Habitat_encoded'] * df['HerdSize_encoded']
# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))
print("\n" + "="*60)
print("ENCODING AND INTERACTION SETUP")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")
print("\nInteraction term interpretation:")
print(" 0: Closed + Large (baseline)")
print(" 0: Open + Large (habitat effect only)")
print(" 0: Closed + Small (herd size effect only)")
print(" 1: Open + Small (habitat + herd size + interaction)")
# Check distribution of combinations
print("\nDistribution of combinations:")
combo_counts = df.groupby(['Habitat', 'Herd Size']).size()
print(combo_counts)
# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()
df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])
# Create interaction term
df['Habitat_HerdSize_interaction'] = df['Habitat_encoded'] * df['HerdSize_encoded']
# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))
print("\n" + "="*60)
print("ENCODING AND INTERACTION SETUP")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")
print("\nInteraction term interpretation:")
print(" 0: Closed + Large (baseline)")
print(" 0: Open + Large (habitat effect only)")
print(" 0: Closed + Small (herd size effect only)")
print(" 1: Open + Small (habitat + herd size + interaction)")
# Check distribution of combinations
print("\nDistribution of combinations:")
combo_counts = df.groupby(['Habitat', 'Herd Size']).size()
print(combo_counts)
============================================================ ENCODING AND INTERACTION SETUP ============================================================ Habitat encoding: {0: 'Closed', 1: 'Open'} Herd Size encoding: {0: 'Large', 1: 'Small'} Interaction term interpretation: 0: Closed + Large (baseline) 0: Open + Large (habitat effect only) 0: Closed + Small (herd size effect only) 1: Open + Small (habitat + herd size + interaction) Distribution of combinations: Habitat Herd Size Closed Large 27 Small 5 Open Large 32 Small 18 dtype: int64
In [11]:
Copied!
# ============================================================================
# 2. MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS
# ============================================================================
behaviors = ['Walk', 'Head Up', 'Graze', 'Other']
# Prepare predictors for both models
X_main = df[['Habitat_encoded', 'HerdSize_encoded']] # Main effects only
X_interaction = df[['Habitat_encoded', 'HerdSize_encoded', 'Habitat_HerdSize_interaction']] # With interaction
# Store results for comparison
main_effects_results = {}
interaction_results = {}
model_comparison = {}
print("\n" + "="*60)
print("MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS")
print("="*60)
for behavior in behaviors:
print(f"\n{'-'*50}")
print(f"BEHAVIOR: {behavior}")
print(f"{'-'*50}")
y = df[behavior]
# ===================
# MAIN EFFECTS MODEL
# ===================
model_main = LinearRegression()
model_main.fit(X_main, y)
y_pred_main = model_main.predict(X_main)
r2_main = r2_score(y, y_pred_main)
main_effects_results[behavior] = {
'model': model_main,
'r2': r2_main,
'coefficients': model_main.coef_,
'intercept': model_main.intercept_,
'predictions': y_pred_main
}
# ===================
# INTERACTION MODEL
# ===================
model_interaction = LinearRegression()
model_interaction.fit(X_interaction, y)
y_pred_interaction = model_interaction.predict(X_interaction)
r2_interaction = r2_score(y, y_pred_interaction)
interaction_results[behavior] = {
'model': model_interaction,
'r2': r2_interaction,
'coefficients': model_interaction.coef_,
'intercept': model_interaction.intercept_,
'predictions': y_pred_interaction
}
# ===================
# F-TEST FOR INTERACTION SIGNIFICANCE
# ===================
# Calculate F-statistic to test if interaction model is significantly better
n = len(y)
# Residual Sum of Squares
rss_main = np.sum((y - y_pred_main)**2)
rss_interaction = np.sum((y - y_pred_interaction)**2)
# Degrees of freedom
df_main = n - X_main.shape[1] - 1 # n - p - 1
df_interaction = n - X_interaction.shape[1] - 1
# F-statistic for interaction term
f_stat = ((rss_main - rss_interaction) / 1) / (rss_interaction / df_interaction)
p_value = 1 - stats.f.cdf(f_stat, 1, df_interaction)
# Store comparison results
model_comparison[behavior] = {
'r2_improvement': r2_interaction - r2_main,
'f_statistic': f_stat,
'p_value': p_value,
'interaction_significant': p_value < 0.05
}
# ===================
# DISPLAY RESULTS
# ===================
print(f"\nMAIN EFFECTS MODEL:")
print(f" R² = {r2_main:.4f}")
print(f" Intercept: {model_main.intercept_:.4f}")
print(f" Habitat coef: {model_main.coef_[0]:.4f}")
print(f" Herd Size coef: {model_main.coef_[1]:.4f}")
print(f"\nINTERACTION MODEL:")
print(f" R² = {r2_interaction:.4f}")
print(f" Intercept: {model_interaction.intercept_:.4f}")
print(f" Habitat coef: {model_interaction.coef_[0]:.4f}")
print(f" Herd Size coef: {model_interaction.coef_[1]:.4f}")
print(f" Interaction coef: {model_interaction.coef_[2]:.4f}")
print(f"\nMODEL COMPARISON:")
print(f" R² improvement: {r2_interaction - r2_main:.4f}")
print(f" F-statistic: {f_stat:.3f}")
print(f" p-value: {p_value:.4f}")
significance = ""
if p_value < 0.001:
significance = "*** (highly significant)"
elif p_value < 0.01:
significance = "** (very significant)"
elif p_value < 0.05:
significance = "* (significant)"
else:
significance = "(not significant)"
print(f" Interaction effect: {significance}")
# ============================================================================
# 2. MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS
# ============================================================================
behaviors = ['Walk', 'Head Up', 'Graze', 'Other']
# Prepare predictors for both models
X_main = df[['Habitat_encoded', 'HerdSize_encoded']] # Main effects only
X_interaction = df[['Habitat_encoded', 'HerdSize_encoded', 'Habitat_HerdSize_interaction']] # With interaction
# Store results for comparison
main_effects_results = {}
interaction_results = {}
model_comparison = {}
print("\n" + "="*60)
print("MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS")
print("="*60)
for behavior in behaviors:
print(f"\n{'-'*50}")
print(f"BEHAVIOR: {behavior}")
print(f"{'-'*50}")
y = df[behavior]
# ===================
# MAIN EFFECTS MODEL
# ===================
model_main = LinearRegression()
model_main.fit(X_main, y)
y_pred_main = model_main.predict(X_main)
r2_main = r2_score(y, y_pred_main)
main_effects_results[behavior] = {
'model': model_main,
'r2': r2_main,
'coefficients': model_main.coef_,
'intercept': model_main.intercept_,
'predictions': y_pred_main
}
# ===================
# INTERACTION MODEL
# ===================
model_interaction = LinearRegression()
model_interaction.fit(X_interaction, y)
y_pred_interaction = model_interaction.predict(X_interaction)
r2_interaction = r2_score(y, y_pred_interaction)
interaction_results[behavior] = {
'model': model_interaction,
'r2': r2_interaction,
'coefficients': model_interaction.coef_,
'intercept': model_interaction.intercept_,
'predictions': y_pred_interaction
}
# ===================
# F-TEST FOR INTERACTION SIGNIFICANCE
# ===================
# Calculate F-statistic to test if interaction model is significantly better
n = len(y)
# Residual Sum of Squares
rss_main = np.sum((y - y_pred_main)**2)
rss_interaction = np.sum((y - y_pred_interaction)**2)
# Degrees of freedom
df_main = n - X_main.shape[1] - 1 # n - p - 1
df_interaction = n - X_interaction.shape[1] - 1
# F-statistic for interaction term
f_stat = ((rss_main - rss_interaction) / 1) / (rss_interaction / df_interaction)
p_value = 1 - stats.f.cdf(f_stat, 1, df_interaction)
# Store comparison results
model_comparison[behavior] = {
'r2_improvement': r2_interaction - r2_main,
'f_statistic': f_stat,
'p_value': p_value,
'interaction_significant': p_value < 0.05
}
# ===================
# DISPLAY RESULTS
# ===================
print(f"\nMAIN EFFECTS MODEL:")
print(f" R² = {r2_main:.4f}")
print(f" Intercept: {model_main.intercept_:.4f}")
print(f" Habitat coef: {model_main.coef_[0]:.4f}")
print(f" Herd Size coef: {model_main.coef_[1]:.4f}")
print(f"\nINTERACTION MODEL:")
print(f" R² = {r2_interaction:.4f}")
print(f" Intercept: {model_interaction.intercept_:.4f}")
print(f" Habitat coef: {model_interaction.coef_[0]:.4f}")
print(f" Herd Size coef: {model_interaction.coef_[1]:.4f}")
print(f" Interaction coef: {model_interaction.coef_[2]:.4f}")
print(f"\nMODEL COMPARISON:")
print(f" R² improvement: {r2_interaction - r2_main:.4f}")
print(f" F-statistic: {f_stat:.3f}")
print(f" p-value: {p_value:.4f}")
significance = ""
if p_value < 0.001:
significance = "*** (highly significant)"
elif p_value < 0.01:
significance = "** (very significant)"
elif p_value < 0.05:
significance = "* (significant)"
else:
significance = "(not significant)"
print(f" Interaction effect: {significance}")
============================================================ MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS ============================================================ -------------------------------------------------- BEHAVIOR: Walk -------------------------------------------------- MAIN EFFECTS MODEL: R² = 0.4022 Intercept: 0.0521 Habitat coef: 0.0868 Herd Size coef: 0.3066 INTERACTION MODEL: R² = 0.4084 Intercept: 0.0409 Habitat coef: 0.1074 Herd Size coef: 0.3782 Interaction coef: -0.0978 MODEL COMPARISON: R² improvement: 0.0062 F-statistic: 0.818 p-value: 0.3685 Interaction effect: (not significant) -------------------------------------------------- BEHAVIOR: Head Up -------------------------------------------------- MAIN EFFECTS MODEL: R² = 0.2500 Intercept: 0.1584 Habitat coef: -0.0084 Herd Size coef: 0.3213 INTERACTION MODEL: R² = 0.2546 Intercept: 0.1699 Habitat coef: -0.0296 Herd Size coef: 0.2478 Interaction coef: 0.1004 MODEL COMPARISON: R² improvement: 0.0046 F-statistic: 0.482 p-value: 0.4894 Interaction effect: (not significant) -------------------------------------------------- BEHAVIOR: Graze -------------------------------------------------- MAIN EFFECTS MODEL: R² = 0.5621 Intercept: 0.7150 Habitat coef: -0.0757 Herd Size coef: -0.6408 INTERACTION MODEL: R² = 0.5628 Intercept: 0.7215 Habitat coef: -0.0877 Herd Size coef: -0.6824 Interaction coef: 0.0569 MODEL COMPARISON: R² improvement: 0.0008 F-statistic: 0.137 p-value: 0.7120 Interaction effect: (not significant) -------------------------------------------------- BEHAVIOR: Other -------------------------------------------------- MAIN EFFECTS MODEL: R² = 0.0016 Intercept: 0.0745 Habitat coef: -0.0027 Herd Size coef: 0.0129 INTERACTION MODEL: R² = 0.0082 Intercept: 0.0677 Habitat coef: 0.0099 Herd Size coef: 0.0565 Interaction coef: -0.0595 MODEL COMPARISON: R² improvement: 0.0066 F-statistic: 0.520 p-value: 0.4730 Interaction effect: (not significant)
In [12]:
Copied!
# ============================================================================
# 3. VISUALIZATION OF INTERACTION EFFECTS
# ============================================================================
# Create comprehensive interaction plots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Interaction Effects: Predicted vs Observed Behavior by Habitat and Herd Size',
fontsize=16, y=0.98)
for i, behavior in enumerate(behaviors):
ax = axes[i//2, i%2]
# Create separate data for each combination
combinations = [
('Closed', 'Large'), ('Closed', 'Small'),
('Open', 'Large'), ('Open', 'Small')
]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
# Plot observed data points
for j, (habitat, herd_size) in enumerate(combinations):
mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
if mask.any():
observed_values = df[mask][behavior]
x_pos = j + np.random.normal(0, 0.05, size=len(observed_values)) # Add jitter
ax.scatter(x_pos, observed_values, alpha=0.6, color=colors[j],
s=30, label=f'{habitat}+{herd_size} (observed)')
# Plot model predictions
for j, (habitat, herd_size) in enumerate(combinations):
mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
if mask.any():
# Main effects prediction
main_pred = main_effects_results[behavior]['predictions'][mask].mean()
# Interaction model prediction
interaction_pred = interaction_results[behavior]['predictions'][mask].mean()
# Plot predictions as horizontal lines
ax.hlines(main_pred, j-0.3, j+0.3, colors='red', linestyles='--',
linewidth=2, alpha=0.8)
ax.hlines(interaction_pred, j-0.2, j+0.2, colors='blue', linestyles='-',
linewidth=3, alpha=0.8)
ax.set_title(f'{behavior}\nInteraction p-value: {model_comparison[behavior]["p_value"]:.4f}')
ax.set_ylabel('Proportion of Time')
ax.set_xticks(range(4))
ax.set_xticklabels([f'{h}\n{hs}' for h, hs in combinations], rotation=0)
# Add legend for first subplot only
if i == 0:
from matplotlib.lines import Line2D
legend_elements = [
Line2D([0], [0], linestyle='--', color='red', label='Main Effects Model'),
Line2D([0], [0], linestyle='-', color='blue', label='Interaction Model'),
Line2D([0], [0], marker='o', color='gray', linestyle='', label='Observed Data')
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=8)
plt.tight_layout()
plt.show()
# ============================================================================
# 3. VISUALIZATION OF INTERACTION EFFECTS
# ============================================================================
# Create comprehensive interaction plots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Interaction Effects: Predicted vs Observed Behavior by Habitat and Herd Size',
fontsize=16, y=0.98)
for i, behavior in enumerate(behaviors):
ax = axes[i//2, i%2]
# Create separate data for each combination
combinations = [
('Closed', 'Large'), ('Closed', 'Small'),
('Open', 'Large'), ('Open', 'Small')
]
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
# Plot observed data points
for j, (habitat, herd_size) in enumerate(combinations):
mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
if mask.any():
observed_values = df[mask][behavior]
x_pos = j + np.random.normal(0, 0.05, size=len(observed_values)) # Add jitter
ax.scatter(x_pos, observed_values, alpha=0.6, color=colors[j],
s=30, label=f'{habitat}+{herd_size} (observed)')
# Plot model predictions
for j, (habitat, herd_size) in enumerate(combinations):
mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
if mask.any():
# Main effects prediction
main_pred = main_effects_results[behavior]['predictions'][mask].mean()
# Interaction model prediction
interaction_pred = interaction_results[behavior]['predictions'][mask].mean()
# Plot predictions as horizontal lines
ax.hlines(main_pred, j-0.3, j+0.3, colors='red', linestyles='--',
linewidth=2, alpha=0.8)
ax.hlines(interaction_pred, j-0.2, j+0.2, colors='blue', linestyles='-',
linewidth=3, alpha=0.8)
ax.set_title(f'{behavior}\nInteraction p-value: {model_comparison[behavior]["p_value"]:.4f}')
ax.set_ylabel('Proportion of Time')
ax.set_xticks(range(4))
ax.set_xticklabels([f'{h}\n{hs}' for h, hs in combinations], rotation=0)
# Add legend for first subplot only
if i == 0:
from matplotlib.lines import Line2D
legend_elements = [
Line2D([0], [0], linestyle='--', color='red', label='Main Effects Model'),
Line2D([0], [0], linestyle='-', color='blue', label='Interaction Model'),
Line2D([0], [0], marker='o', color='gray', linestyle='', label='Observed Data')
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=8)
plt.tight_layout()
plt.show()
In [13]:
Copied!
# ============================================================================
# 4. DETAILED INTERACTION INTERPRETATION
# ============================================================================
print("\n" + "="*60)
print("INTERACTION EFFECTS INTERPRETATION")
print("="*60)
# Calculate predicted values for each combination using interaction model
print("\nPredicted values for each combination (Interaction Model):")
print("-" * 55)
combination_predictions = {}
for behavior in behaviors:
print(f"\n{behavior}:")
model = interaction_results[behavior]['model']
intercept = model.intercept_
habitat_coef = model.coef_[0]
herdsize_coef = model.coef_[1]
interaction_coef = model.coef_[2]
# Calculate predictions for each combination
closed_large = intercept # 0 + 0 + 0
open_large = intercept + habitat_coef # 1 + 0 + 0
closed_small = intercept + herdsize_coef # 0 + 1 + 0
open_small = intercept + habitat_coef + herdsize_coef + interaction_coef # 1 + 1 + 1
combination_predictions[behavior] = {
'Closed+Large': closed_large,
'Open+Large': open_large,
'Closed+Small': closed_small,
'Open+Small': open_small
}
print(f" Closed + Large: {closed_large:.4f}")
print(f" Open + Large: {open_large:.4f}")
print(f" Closed + Small: {closed_small:.4f}")
print(f" Open + Small: {open_small:.4f}")
# Calculate the interaction effect
# Interaction = (Open+Small) - (Open+Large) - (Closed+Small) + (Closed+Large)
interaction_effect = open_small - open_large - closed_small + closed_large
print(f" Interaction effect: {interaction_effect:.4f}")
if abs(interaction_effect) > 0.01 and model_comparison[behavior]['interaction_significant']:
if interaction_effect > 0:
print(" → Open habitat effect is STRONGER in small herds")
else:
print(" → Open habitat effect is WEAKER in small herds")
# ============================================================================
# 4. DETAILED INTERACTION INTERPRETATION
# ============================================================================
print("\n" + "="*60)
print("INTERACTION EFFECTS INTERPRETATION")
print("="*60)
# Calculate predicted values for each combination using interaction model
print("\nPredicted values for each combination (Interaction Model):")
print("-" * 55)
combination_predictions = {}
for behavior in behaviors:
print(f"\n{behavior}:")
model = interaction_results[behavior]['model']
intercept = model.intercept_
habitat_coef = model.coef_[0]
herdsize_coef = model.coef_[1]
interaction_coef = model.coef_[2]
# Calculate predictions for each combination
closed_large = intercept # 0 + 0 + 0
open_large = intercept + habitat_coef # 1 + 0 + 0
closed_small = intercept + herdsize_coef # 0 + 1 + 0
open_small = intercept + habitat_coef + herdsize_coef + interaction_coef # 1 + 1 + 1
combination_predictions[behavior] = {
'Closed+Large': closed_large,
'Open+Large': open_large,
'Closed+Small': closed_small,
'Open+Small': open_small
}
print(f" Closed + Large: {closed_large:.4f}")
print(f" Open + Large: {open_large:.4f}")
print(f" Closed + Small: {closed_small:.4f}")
print(f" Open + Small: {open_small:.4f}")
# Calculate the interaction effect
# Interaction = (Open+Small) - (Open+Large) - (Closed+Small) + (Closed+Large)
interaction_effect = open_small - open_large - closed_small + closed_large
print(f" Interaction effect: {interaction_effect:.4f}")
if abs(interaction_effect) > 0.01 and model_comparison[behavior]['interaction_significant']:
if interaction_effect > 0:
print(" → Open habitat effect is STRONGER in small herds")
else:
print(" → Open habitat effect is WEAKER in small herds")
============================================================ INTERACTION EFFECTS INTERPRETATION ============================================================ Predicted values for each combination (Interaction Model): ------------------------------------------------------- Walk: Closed + Large: 0.0409 Open + Large: 0.1483 Closed + Small: 0.4191 Open + Small: 0.4287 Interaction effect: -0.0978 Head Up: Closed + Large: 0.1699 Open + Large: 0.1403 Closed + Small: 0.4177 Open + Small: 0.4885 Interaction effect: 0.1004 Graze: Closed + Large: 0.7215 Open + Large: 0.6337 Closed + Small: 0.0390 Open + Small: 0.0082 Interaction effect: 0.0569 Other: Closed + Large: 0.0677 Open + Large: 0.0776 Closed + Small: 0.1242 Open + Small: 0.0746 Interaction effect: -0.0595
In [14]:
Copied!
# ============================================================================
# 5. SUMMARY COMPARISON TABLE
# ============================================================================
print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)
summary_comparison = pd.DataFrame({
'Behavior': behaviors,
'Main_R2': [main_effects_results[b]['r2'] for b in behaviors],
'Interaction_R2': [interaction_results[b]['r2'] for b in behaviors],
'R2_Improvement': [model_comparison[b]['r2_improvement'] for b in behaviors],
'F_Statistic': [model_comparison[b]['f_statistic'] for b in behaviors],
'P_Value': [model_comparison[b]['p_value'] for b in behaviors],
'Interaction_Significant': [model_comparison[b]['interaction_significant'] for b in behaviors]
})
print(summary_comparison.round(4))
# Count significant interactions
significant_count = sum(summary_comparison['Interaction_Significant'])
print(f"\nNumber of behaviors with significant interactions: {significant_count}/4")
if significant_count > 0:
print("\nBehaviors with significant interactions:")
significant_behaviors = summary_comparison[summary_comparison['Interaction_Significant']]['Behavior'].tolist()
for behavior in significant_behaviors:
p_val = model_comparison[behavior]['p_value']
r2_imp = model_comparison[behavior]['r2_improvement']
print(f" - {behavior}: p = {p_val:.4f}, R² improvement = {r2_imp:.4f}")
# ============================================================================
# 6. RECOMMENDATION
# ============================================================================
print("\n" + "="*60)
print("RECOMMENDATION")
print("="*60)
if significant_count > 0:
print("✅ INTERACTION EFFECTS DETECTED!")
print("\nRecommendation: Use the interaction model for behaviors with significant")
print("interaction terms. The habitat and herd size effects are not simply additive")
print("- their combined effect differs from what you'd expect by just adding them up.")
print("\nThis suggests that:")
print("- The effect of habitat depends on herd size (or vice versa)")
print("- Certain combinations have unique behavioral responses")
else:
print("❌ NO SIGNIFICANT INTERACTIONS DETECTED")
print("\nRecommendation: Stick with the main effects model. The habitat and")
print("herd size effects appear to be additive - you can predict the combined")
print("effect by simply adding the individual effects together.")
# Save detailed results
summary_comparison.to_csv('interaction_model_comparison.csv', index=False)
print(f"\nDetailed results saved to 'interaction_model_comparison.csv'")
print("\n" + "="*60)
print("INTERACTION ANALYSIS COMPLETE")
print("="*60)
# ============================================================================
# 5. SUMMARY COMPARISON TABLE
# ============================================================================
print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)
summary_comparison = pd.DataFrame({
'Behavior': behaviors,
'Main_R2': [main_effects_results[b]['r2'] for b in behaviors],
'Interaction_R2': [interaction_results[b]['r2'] for b in behaviors],
'R2_Improvement': [model_comparison[b]['r2_improvement'] for b in behaviors],
'F_Statistic': [model_comparison[b]['f_statistic'] for b in behaviors],
'P_Value': [model_comparison[b]['p_value'] for b in behaviors],
'Interaction_Significant': [model_comparison[b]['interaction_significant'] for b in behaviors]
})
print(summary_comparison.round(4))
# Count significant interactions
significant_count = sum(summary_comparison['Interaction_Significant'])
print(f"\nNumber of behaviors with significant interactions: {significant_count}/4")
if significant_count > 0:
print("\nBehaviors with significant interactions:")
significant_behaviors = summary_comparison[summary_comparison['Interaction_Significant']]['Behavior'].tolist()
for behavior in significant_behaviors:
p_val = model_comparison[behavior]['p_value']
r2_imp = model_comparison[behavior]['r2_improvement']
print(f" - {behavior}: p = {p_val:.4f}, R² improvement = {r2_imp:.4f}")
# ============================================================================
# 6. RECOMMENDATION
# ============================================================================
print("\n" + "="*60)
print("RECOMMENDATION")
print("="*60)
if significant_count > 0:
print("✅ INTERACTION EFFECTS DETECTED!")
print("\nRecommendation: Use the interaction model for behaviors with significant")
print("interaction terms. The habitat and herd size effects are not simply additive")
print("- their combined effect differs from what you'd expect by just adding them up.")
print("\nThis suggests that:")
print("- The effect of habitat depends on herd size (or vice versa)")
print("- Certain combinations have unique behavioral responses")
else:
print("❌ NO SIGNIFICANT INTERACTIONS DETECTED")
print("\nRecommendation: Stick with the main effects model. The habitat and")
print("herd size effects appear to be additive - you can predict the combined")
print("effect by simply adding the individual effects together.")
# Save detailed results
summary_comparison.to_csv('interaction_model_comparison.csv', index=False)
print(f"\nDetailed results saved to 'interaction_model_comparison.csv'")
print("\n" + "="*60)
print("INTERACTION ANALYSIS COMPLETE")
print("="*60)
============================================================ MODEL COMPARISON SUMMARY ============================================================ Behavior Main_R2 Interaction_R2 R2_Improvement F_Statistic P_Value \ 0 Walk 0.4022 0.4084 0.0062 0.8181 0.3685 1 Head Up 0.2500 0.2546 0.0046 0.4824 0.4894 2 Graze 0.5621 0.5628 0.0008 0.1373 0.7120 3 Other 0.0016 0.0082 0.0066 0.5200 0.4730 Interaction_Significant 0 False 1 False 2 False 3 False Number of behaviors with significant interactions: 0/4 ============================================================ RECOMMENDATION ============================================================ ❌ NO SIGNIFICANT INTERACTIONS DETECTED Recommendation: Stick with the main effects model. The habitat and herd size effects appear to be additive - you can predict the combined effect by simply adding the individual effects together. Detailed results saved to 'interaction_model_comparison.csv' ============================================================ INTERACTION ANALYSIS COMPLETE ============================================================