Grevy's Zebra Time Budget Analysis: Linear Regression¶

Comparing Habitat and Herd Size Effects on Behavior

In [2]:

Copied!





import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [3]:

Copied!





# ============================================================================
# 1. DATA LOADING AND PREPROCESSING
# ============================================================================

# Load the dataset
df = pd.read_csv('data/grevystimebudgetscleaned.csv')

print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())

# Check data types and missing values
print(f"\nData Types:")
print(df.dtypes)
print(f"\nMissing Values:")
print(df.isnull().sum())

# Examine categorical variables
print(f"\nUnique Habitats: {df['Habitat'].unique()}")
print(f"Unique Herd Sizes: {df['Herd Size'].unique()}")
# ============================================================================
# 1. DATA LOADING AND PREPROCESSING
# ============================================================================

# Load the dataset
df = pd.read_csv('data/grevystimebudgetscleaned.csv')

print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nFirst few rows:")
print(df.head())

# Check data types and missing values
print(f"\nData Types:")
print(df.dtypes)
print(f"\nMissing Values:")
print(df.isnull().sum())

# Examine categorical variables
print(f"\nUnique Habitats: {df['Habitat'].unique()}")
print(f"Unique Herd Sizes: {df['Herd Size'].unique()}")

Dataset Overview:
Shape: (82, 9)

Columns: ['Sesssion', 'mini_scene_id', 'Walk', 'Head Up', 'Graze', 'Other', 'Interval(sec)', 'Habitat', 'Herd Size']

First few rows:
   Sesssion         mini_scene_id      Walk   Head Up     Graze     Other  \
0         1   11_01_23-DJI_0977_2  0.442609  0.417140  0.096197  0.044054   
1         1   11_01_23-DJI_0977_3  0.513430  0.371257  0.066039  0.049273   
2         1   11_01_23-DJI_0977_1  0.557903  0.358236  0.032945  0.050915   
3         2   11_01_23-DJI_0488_9  0.016229  0.983771  0.000000  0.000000   
4         2  11_01_23-DJI_0488_45  0.000000  0.745382  0.000000  0.254618   

   Interval(sec) Habitat Herd Size  
0     193.700000  Closed     Small  
1     194.833333  Closed     Small  
2     200.333333  Closed     Small  
3     112.966667    Open     Small  
4     108.266667    Open     Small  

Data Types:
Sesssion           int64
mini_scene_id     object
Walk             float64
Head Up          float64
Graze            float64
Other            float64
Interval(sec)    float64
Habitat           object
Herd Size         object
dtype: object

Missing Values:
Sesssion         0
mini_scene_id    0
Walk             0
Head Up          0
Graze            0
Other            0
Interval(sec)    0
Habitat          0
Herd Size        0
dtype: int64

Unique Habitats: ['Closed' 'Open']
Unique Herd Sizes: ['Small' 'Large']

In [4]:

Copied!





# ============================================================================
# 2. EXPLORATORY DATA ANALYSIS
# ============================================================================

# Behavior variables (dependent variables)
behaviors = ['Walk', 'Head Up', 'Graze', 'Other']

# Summary statistics for behaviors by habitat and herd size
print("\n" + "="*60)
print("SUMMARY STATISTICS BY HABITAT AND HERD SIZE")
print("="*60)

for behavior in behaviors:
    print(f"\n{behavior}:")
    print("By Habitat:")
    print(df.groupby('Habitat')[behavior].agg(['count', 'mean', 'std']).round(4))
    print("\nBy Herd Size:")
    print(df.groupby('Herd Size')[behavior].agg(['count', 'mean', 'std']).round(4))

# Create visualization of behavior patterns
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Grevy\'s Zebra Behavior Patterns by Habitat and Herd Size', fontsize=16, y=0.98)

for i, behavior in enumerate(behaviors):
    row, col = i // 2, i % 2
    
    # Create subplot with both habitat and herd size
    ax = axes[row, col]
    
    # Box plot showing distribution by habitat and herd size
    df_plot = df.melt(id_vars=['Habitat', 'Herd Size'], 
                      value_vars=[behavior], 
                      var_name='Behavior', 
                      value_name='Proportion')
    
    sns.boxplot(data=df_plot, x='Habitat', y='Proportion', hue='Herd Size', ax=ax)
    ax.set_title(f'{behavior} by Habitat and Herd Size')
    ax.set_ylabel('Proportion of Time')

plt.tight_layout()
plt.show()
# ============================================================================
# 2. EXPLORATORY DATA ANALYSIS
# ============================================================================

# Behavior variables (dependent variables)
behaviors = ['Walk', 'Head Up', 'Graze', 'Other']

# Summary statistics for behaviors by habitat and herd size
print("\n" + "="*60)
print("SUMMARY STATISTICS BY HABITAT AND HERD SIZE")
print("="*60)

for behavior in behaviors:
    print(f"\n{behavior}:")
    print("By Habitat:")
    print(df.groupby('Habitat')[behavior].agg(['count', 'mean', 'std']).round(4))
    print("\nBy Herd Size:")
    print(df.groupby('Herd Size')[behavior].agg(['count', 'mean', 'std']).round(4))

# Create visualization of behavior patterns
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Grevy\'s Zebra Behavior Patterns by Habitat and Herd Size', fontsize=16, y=0.98)

for i, behavior in enumerate(behaviors):
    row, col = i // 2, i % 2
    
    # Create subplot with both habitat and herd size
    ax = axes[row, col]
    
    # Box plot showing distribution by habitat and herd size
    df_plot = df.melt(id_vars=['Habitat', 'Herd Size'], 
                      value_vars=[behavior], 
                      var_name='Behavior', 
                      value_name='Proportion')
    
    sns.boxplot(data=df_plot, x='Habitat', y='Proportion', hue='Herd Size', ax=ax)
    ax.set_title(f'{behavior} by Habitat and Herd Size')
    ax.set_ylabel('Proportion of Time')

plt.tight_layout()
plt.show()

============================================================
SUMMARY STATISTICS BY HABITAT AND HERD SIZE
============================================================

Walk:
By Habitat:
         count    mean     std
Habitat                       
Closed      32  0.1000  0.1557
Open        50  0.2493  0.2701

By Herd Size:
           count    mean     std
Herd Size                       
Large         59  0.0992  0.1370
Small         23  0.4266  0.2936

Head Up:
By Habitat:
         count    mean     std
Habitat                       
Closed      32  0.2086  0.2867
Open        50  0.2657  0.2907

By Herd Size:
           count    mean     std
Herd Size                       
Large         59  0.1539  0.2281
Small         23  0.4731  0.3050

Graze:
By Habitat:
         count    mean     std
Habitat                       
Closed      32  0.6148  0.3936
Open        50  0.4085  0.3871

By Herd Size:
           count    mean     std
Herd Size                       
Large         59  0.6739  0.3153
Small         23  0.0149  0.0314

Other:
By Habitat:
         count    mean     std
Habitat                       
Closed      32  0.0765  0.1507
Open        50  0.0765  0.1391

By Herd Size:
           count    mean     std
Herd Size                       
Large         59  0.0731  0.1540
Small         23  0.0854  0.1114

No description has been provided for this image

In [11]:

Copied!





# ============================================================================
# 3. LINEAR REGRESSION ANALYSIS
# ============================================================================

# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()

df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])

# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))

print("\n" + "="*60)
print("ENCODING MAPPINGS")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")

# Prepare predictors
X = df[['Habitat_encoded', 'HerdSize_encoded']]
feature_names = ['Habitat', 'Herd Size']

# Store results for comparison
regression_results = {}

print("\n" + "="*60)
print("LINEAR REGRESSION RESULTS")
print("="*60)

# save significance levels
significance_levels = []

for behavior in behaviors:
    print(f"\n{'-'*40}")
    print(f"BEHAVIOR: {behavior}")
    print(f"{'-'*40}")
    
    # Fit linear regression
    y = df[behavior]
    model = LinearRegression()
    model.fit(X, y)
    
    # Predictions and R²
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    
    # Store results
    regression_results[behavior] = {
        'coefficients': model.coef_,
        'intercept': model.intercept_,
        'r2': r2,
        'model': model
    }
    
    print(f"R² Score: {r2:.4f}")
    print(f"Intercept: {model.intercept_:.4f}")
    print("Coefficients:")
    for i, (coef, feature) in enumerate(zip(model.coef_, feature_names)):
        print(f"  {feature}: {coef:.4f}")
    
    # Statistical significance testing
    # Calculate t-statistics and p-values
    n = len(y)
    k = len(model.coef_)
    
    # Residual sum of squares
    y_pred = model.predict(X)
    residuals = y - y_pred
    rss = np.sum(residuals**2)
    
    # Mean squared error
    mse = rss / (n - k - 1)
    
    # Standard errors of coefficients
    X_with_intercept = np.column_stack([np.ones(n), X])
    try:
        cov_matrix = mse * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
        se_coefs = np.sqrt(np.diag(cov_matrix))[1:]  # Exclude intercept
        
        # t-statistics and p-values
        t_stats = model.coef_ / se_coefs
        p_values = 2 * (1 - stats.t.cdf(np.abs(t_stats), n - k - 1))
        
        print("Statistical Significance:")
        for i, (feature, t_stat, p_val) in enumerate(zip(feature_names, t_stats, p_values)):
            significance = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
            print(f"  {feature}: t={t_stat:.3f}, p={p_val:.4f} {significance}")
            
        # save t_stats and p_values for later use
        significance_levels.append((behavior, feature_names, t_stats, p_values))
    
    except np.linalg.LinAlgError:
        print("Could not calculate statistical significance (singular matrix)")
# ============================================================================
# 3. LINEAR REGRESSION ANALYSIS
# ============================================================================

# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()

df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])

# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))

print("\n" + "="*60)
print("ENCODING MAPPINGS")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")

# Prepare predictors
X = df[['Habitat_encoded', 'HerdSize_encoded']]
feature_names = ['Habitat', 'Herd Size']

# Store results for comparison
regression_results = {}

print("\n" + "="*60)
print("LINEAR REGRESSION RESULTS")
print("="*60)

# save significance levels
significance_levels = []

for behavior in behaviors:
    print(f"\n{'-'*40}")
    print(f"BEHAVIOR: {behavior}")
    print(f"{'-'*40}")
    
    # Fit linear regression
    y = df[behavior]
    model = LinearRegression()
    model.fit(X, y)
    
    # Predictions and R²
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    
    # Store results
    regression_results[behavior] = {
        'coefficients': model.coef_,
        'intercept': model.intercept_,
        'r2': r2,
        'model': model
    }
    
    print(f"R² Score: {r2:.4f}")
    print(f"Intercept: {model.intercept_:.4f}")
    print("Coefficients:")
    for i, (coef, feature) in enumerate(zip(model.coef_, feature_names)):
        print(f"  {feature}: {coef:.4f}")
    
    # Statistical significance testing
    # Calculate t-statistics and p-values
    n = len(y)
    k = len(model.coef_)
    
    # Residual sum of squares
    y_pred = model.predict(X)
    residuals = y - y_pred
    rss = np.sum(residuals**2)
    
    # Mean squared error
    mse = rss / (n - k - 1)
    
    # Standard errors of coefficients
    X_with_intercept = np.column_stack([np.ones(n), X])
    try:
        cov_matrix = mse * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
        se_coefs = np.sqrt(np.diag(cov_matrix))[1:]  # Exclude intercept
        
        # t-statistics and p-values
        t_stats = model.coef_ / se_coefs
        p_values = 2 * (1 - stats.t.cdf(np.abs(t_stats), n - k - 1))
        
        print("Statistical Significance:")
        for i, (feature, t_stat, p_val) in enumerate(zip(feature_names, t_stats, p_values)):
            significance = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
            print(f"  {feature}: t={t_stat:.3f}, p={p_val:.4f} {significance}")
            
        # save t_stats and p_values for later use
        significance_levels.append((behavior, feature_names, t_stats, p_values))
    
    except np.linalg.LinAlgError:
        print("Could not calculate statistical significance (singular matrix)")

============================================================
ENCODING MAPPINGS
============================================================
Habitat encoding: {0: 'Closed', 1: 'Open'}
Herd Size encoding: {0: 'Large', 1: 'Small'}

============================================================
LINEAR REGRESSION RESULTS
============================================================

----------------------------------------
BEHAVIOR: Walk
----------------------------------------
R² Score: 0.4022
Intercept: 0.0521
Coefficients:
  Habitat: 0.0868
  Herd Size: 0.3066
Statistical Significance:
  Habitat: t=1.971, p=0.0523 
  Herd Size: t=6.409, p=0.0000 ***

----------------------------------------
BEHAVIOR: Head Up
----------------------------------------
R² Score: 0.2500
Intercept: 0.1584
Coefficients:
  Habitat: -0.0084
  Herd Size: 0.3213
Statistical Significance:
  Habitat: t=-0.143, p=0.8864 
  Herd Size: t=5.034, p=0.0000 ***

----------------------------------------
BEHAVIOR: Graze
----------------------------------------
R² Score: 0.5621
Intercept: 0.7150
Coefficients:
  Habitat: -0.0757
  Herd Size: -0.6408
Statistical Significance:
  Habitat: t=-1.216, p=0.2275 
  Herd Size: t=-9.479, p=0.0000 ***

----------------------------------------
BEHAVIOR: Other
----------------------------------------
R² Score: 0.0016
Intercept: 0.0745
Coefficients:
  Habitat: -0.0027
  Herd Size: 0.0129
Statistical Significance:
  Habitat: t=-0.080, p=0.9368 
  Herd Size: t=0.355, p=0.7232

In [12]:

Copied!

significance_levels
significance_levels

Out[12]:

[('Walk',
  ['Habitat', 'Herd Size'],
  array([1.9706704 , 6.40901791]),
  array([5.22644921e-02, 9.83405646e-09])),
 ('Head Up',
  ['Habitat', 'Herd Size'],
  array([-0.14331907,  5.03400118]),
  array([8.86403135e-01, 2.95772428e-06])),
 ('Graze',
  ['Habitat', 'Herd Size'],
  array([-1.21636233, -9.47907617]),
  array([2.27468742e-01, 1.13242749e-14])),
 ('Other',
  ['Habitat', 'Herd Size'],
  array([-0.07960377,  0.35540207]),
  array([0.9367537 , 0.72323558]))]

In [14]:

Copied!





# print the regression results, coefficients, and statistical significance in table for manuscript
print("\n" + "="*60)
print("REGRESSION SUMMARY TABLE")
print("="*60)
summary_table = []
for behavior, results in regression_results.items():
    row = {
        'Behavior': behavior,
        'R²': results['r2'],
        'Intercept': results['intercept']
    }
    for i, feature in enumerate(feature_names):
        row[f'Coef_{feature}'] = results['coefficients'][i]
    summary_table.append(row)
    
    for sig in significance_levels:
        if sig[0] == behavior:
            for i, feature in enumerate(sig[1]):
                row[f't_{feature}'] = sig[2][i]
                row[f'p_{feature}'] = sig[3][i]
    
    # add checkmark for significant or not
    for feature in feature_names:
        p_val = row.get(f'p_{feature}', 1)
        row[f'Significant_{feature}'] = '✓' if p_val < 0.05 else ''
        
        
    
    
summary_df = pd.DataFrame(summary_table)
print(summary_df.round(4))

# Save summary table to CSV
summary_df.to_csv('regression_summary.csv', index=False)
print("\nRegression summary saved to 'regression_summary.csv'")
# ============================================================================
# print the regression results, coefficients, and statistical significance in table for manuscript
print("\n" + "="*60)
print("REGRESSION SUMMARY TABLE")
print("="*60)
summary_table = []
for behavior, results in regression_results.items():
    row = {
        'Behavior': behavior,
        'R²': results['r2'],
        'Intercept': results['intercept']
    }
    for i, feature in enumerate(feature_names):
        row[f'Coef_{feature}'] = results['coefficients'][i]
    summary_table.append(row)
    
    for sig in significance_levels:
        if sig[0] == behavior:
            for i, feature in enumerate(sig[1]):
                row[f't_{feature}'] = sig[2][i]
                row[f'p_{feature}'] = sig[3][i]
    
    # add checkmark for significant or not
    for feature in feature_names:
        p_val = row.get(f'p_{feature}', 1)
        row[f'Significant_{feature}'] = '✓' if p_val < 0.05 else ''
        
        
    
    
summary_df = pd.DataFrame(summary_table)
print(summary_df.round(4))

# Save summary table to CSV
summary_df.to_csv('regression_summary.csv', index=False)
print("\nRegression summary saved to 'regression_summary.csv'")
# ============================================================================

============================================================
REGRESSION SUMMARY TABLE
============================================================
  Behavior      R²  Intercept  Coef_Habitat  Coef_Herd Size  t_Habitat  \
0     Walk  0.4022     0.0521        0.0868          0.3066     1.9707   
1  Head Up  0.2500     0.1584       -0.0084          0.3213    -0.1433   
2    Graze  0.5621     0.7150       -0.0757         -0.6408    -1.2164   
3    Other  0.0016     0.0745       -0.0027          0.0129    -0.0796   

   p_Habitat  t_Herd Size  p_Herd Size Significant_Habitat  \
0     0.0523       6.4090       0.0000                       
1     0.8864       5.0340       0.0000                       
2     0.2275      -9.4791       0.0000                       
3     0.9368       0.3554       0.7232                       

  Significant_Herd Size  
0                     ✓  
1                     ✓  
2                     ✓  
3                        

Regression summary saved to 'regression_summary.csv'

In [6]:

Copied!





# ============================================================================
# 4. COEFFICIENT COMPARISON VISUALIZATION
# ============================================================================

# Create coefficient comparison plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Linear Regression Coefficients: Habitat and Herd Size Effects on Behaviors', 
             fontsize=16, y=0.98)

# Prepare data for plotting
coef_data = []
for behavior in behaviors:
    for i, feature in enumerate(feature_names):
        coef_data.append({
            'Behavior': behavior,
            'Predictor': feature,
            'Coefficient': regression_results[behavior]['coefficients'][i],
            'R²': regression_results[behavior]['r2']
        })

coef_df = pd.DataFrame(coef_data)

# Plot 1: Habitat coefficients
ax1 = axes[0, 0]
habitat_coefs = coef_df[coef_df['Predictor'] == 'Habitat']
bars1 = ax1.bar(habitat_coefs['Behavior'], habitat_coefs['Coefficient'], 
                color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax1.set_title('Habitat Effect on Behaviors\n(Closed=0, Open=1)')
ax1.set_ylabel('Coefficient Value')
ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax1.tick_params(axis='x', rotation=45)

# Add value labels on bars
for bar, value in zip(bars1, habitat_coefs['Coefficient']):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
             f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')

# Plot 2: Herd Size coefficients
ax2 = axes[0, 1]
herdsize_coefs = coef_df[coef_df['Predictor'] == 'Herd Size']
bars2 = ax2.bar(herdsize_coefs['Behavior'], herdsize_coefs['Coefficient'],
                color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax2.set_title('Herd Size Effect on Behaviors\n(Large=0, Small=1)')
ax2.set_ylabel('Coefficient Value')
ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax2.tick_params(axis='x', rotation=45)

# Add value labels on bars
for bar, value in zip(bars2, herdsize_coefs['Coefficient']):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
             f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')

# Plot 3: R² values comparison
ax3 = axes[1, 0]
r2_values = [regression_results[behavior]['r2'] for behavior in behaviors]
bars3 = ax3.bar(behaviors, r2_values, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax3.set_title('Model Performance (R² values)')
ax3.set_ylabel('R² Score')
ax3.set_ylim(0, max(r2_values) * 1.1)
ax3.tick_params(axis='x', rotation=45)

# Add value labels on bars
for bar, value in zip(bars3, r2_values):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + 0.005,
             f'{value:.3f}', ha='center', va='bottom')

# Plot 4: Coefficient comparison heatmap
ax4 = axes[1, 1]
pivot_coefs = coef_df.pivot(index='Behavior', columns='Predictor', values='Coefficient')
sns.heatmap(pivot_coefs, annot=True, cmap='RdBu_r', center=0, ax=ax4, 
            fmt='.3f', cbar_kws={'label': 'Coefficient Value'})
ax4.set_title('Coefficient Heatmap')

plt.tight_layout()
plt.show()
# ============================================================================
# 4. COEFFICIENT COMPARISON VISUALIZATION
# ============================================================================

# Create coefficient comparison plots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Linear Regression Coefficients: Habitat and Herd Size Effects on Behaviors', 
             fontsize=16, y=0.98)

# Prepare data for plotting
coef_data = []
for behavior in behaviors:
    for i, feature in enumerate(feature_names):
        coef_data.append({
            'Behavior': behavior,
            'Predictor': feature,
            'Coefficient': regression_results[behavior]['coefficients'][i],
            'R²': regression_results[behavior]['r2']
        })

coef_df = pd.DataFrame(coef_data)

# Plot 1: Habitat coefficients
ax1 = axes[0, 0]
habitat_coefs = coef_df[coef_df['Predictor'] == 'Habitat']
bars1 = ax1.bar(habitat_coefs['Behavior'], habitat_coefs['Coefficient'], 
                color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax1.set_title('Habitat Effect on Behaviors\n(Closed=0, Open=1)')
ax1.set_ylabel('Coefficient Value')
ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax1.tick_params(axis='x', rotation=45)

# Add value labels on bars
for bar, value in zip(bars1, habitat_coefs['Coefficient']):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
             f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')

# Plot 2: Herd Size coefficients
ax2 = axes[0, 1]
herdsize_coefs = coef_df[coef_df['Predictor'] == 'Herd Size']
bars2 = ax2.bar(herdsize_coefs['Behavior'], herdsize_coefs['Coefficient'],
                color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax2.set_title('Herd Size Effect on Behaviors\n(Large=0, Small=1)')
ax2.set_ylabel('Coefficient Value')
ax2.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax2.tick_params(axis='x', rotation=45)

# Add value labels on bars
for bar, value in zip(bars2, herdsize_coefs['Coefficient']):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + (0.01 if height > 0 else -0.02),
             f'{value:.3f}', ha='center', va='bottom' if height > 0 else 'top')

# Plot 3: R² values comparison
ax3 = axes[1, 0]
r2_values = [regression_results[behavior]['r2'] for behavior in behaviors]
bars3 = ax3.bar(behaviors, r2_values, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax3.set_title('Model Performance (R² values)')
ax3.set_ylabel('R² Score')
ax3.set_ylim(0, max(r2_values) * 1.1)
ax3.tick_params(axis='x', rotation=45)

# Add value labels on bars
for bar, value in zip(bars3, r2_values):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + 0.005,
             f'{value:.3f}', ha='center', va='bottom')

# Plot 4: Coefficient comparison heatmap
ax4 = axes[1, 1]
pivot_coefs = coef_df.pivot(index='Behavior', columns='Predictor', values='Coefficient')
sns.heatmap(pivot_coefs, annot=True, cmap='RdBu_r', center=0, ax=ax4, 
            fmt='.3f', cbar_kws={'label': 'Coefficient Value'})
ax4.set_title('Coefficient Heatmap')

plt.tight_layout()
plt.show()

In [7]:

Copied!





# ============================================================================
# 5. DETAILED INTERPRETATION
# ============================================================================

print("\n" + "="*60)
print("INTERPRETATION OF RESULTS")
print("="*60)

print("\nCOEFFICIENT INTERPRETATION:")
print("-" * 30)
print("Habitat coefficients (Closed=0, Open=1):")
print("  Positive coefficient = behavior increases in Open habitat")
print("  Negative coefficient = behavior decreases in Open habitat")
print("\nHerd Size coefficients (Large=0, Small=1):")
print("  Positive coefficient = behavior increases in Small herds")
print("  Negative coefficient = behavior decreases in Small herds")

print("\nKEY FINDINGS:")
print("-" * 15)
for behavior in behaviors:
    habitat_coef = regression_results[behavior]['coefficients'][0]
    herdsize_coef = regression_results[behavior]['coefficients'][1]
    r2 = regression_results[behavior]['r2']
    
    print(f"\n{behavior}:")
    print(f"  Model explains {r2*100:.1f}% of variance")
    
    # Habitat effect
    if abs(habitat_coef) > 0.01:  # Threshold for meaningful effect
        direction = "higher" if habitat_coef > 0 else "lower"
        print(f"  Habitat: {direction} in Open vs Closed ({habitat_coef:+.3f})")
    else:
        print(f"  Habitat: minimal effect ({habitat_coef:+.3f})")
    
    # Herd size effect
    if abs(herdsize_coef) > 0.01:
        direction = "higher" if herdsize_coef > 0 else "lower"
        print(f"  Herd Size: {direction} in Small vs Large herds ({herdsize_coef:+.3f})")
    else:
        print(f"  Herd Size: minimal effect ({herdsize_coef:+.3f})")
# ============================================================================
# 5. DETAILED INTERPRETATION
# ============================================================================

print("\n" + "="*60)
print("INTERPRETATION OF RESULTS")
print("="*60)

print("\nCOEFFICIENT INTERPRETATION:")
print("-" * 30)
print("Habitat coefficients (Closed=0, Open=1):")
print("  Positive coefficient = behavior increases in Open habitat")
print("  Negative coefficient = behavior decreases in Open habitat")
print("\nHerd Size coefficients (Large=0, Small=1):")
print("  Positive coefficient = behavior increases in Small herds")
print("  Negative coefficient = behavior decreases in Small herds")

print("\nKEY FINDINGS:")
print("-" * 15)
for behavior in behaviors:
    habitat_coef = regression_results[behavior]['coefficients'][0]
    herdsize_coef = regression_results[behavior]['coefficients'][1]
    r2 = regression_results[behavior]['r2']
    
    print(f"\n{behavior}:")
    print(f"  Model explains {r2*100:.1f}% of variance")
    
    # Habitat effect
    if abs(habitat_coef) > 0.01:  # Threshold for meaningful effect
        direction = "higher" if habitat_coef > 0 else "lower"
        print(f"  Habitat: {direction} in Open vs Closed ({habitat_coef:+.3f})")
    else:
        print(f"  Habitat: minimal effect ({habitat_coef:+.3f})")
    
    # Herd size effect
    if abs(herdsize_coef) > 0.01:
        direction = "higher" if herdsize_coef > 0 else "lower"
        print(f"  Herd Size: {direction} in Small vs Large herds ({herdsize_coef:+.3f})")
    else:
        print(f"  Herd Size: minimal effect ({herdsize_coef:+.3f})")

============================================================
INTERPRETATION OF RESULTS
============================================================

COEFFICIENT INTERPRETATION:
------------------------------
Habitat coefficients (Closed=0, Open=1):
  Positive coefficient = behavior increases in Open habitat
  Negative coefficient = behavior decreases in Open habitat

Herd Size coefficients (Large=0, Small=1):
  Positive coefficient = behavior increases in Small herds
  Negative coefficient = behavior decreases in Small herds

KEY FINDINGS:
---------------

Walk:
  Model explains 40.2% of variance
  Habitat: higher in Open vs Closed (+0.087)
  Herd Size: higher in Small vs Large herds (+0.307)

Head Up:
  Model explains 25.0% of variance
  Habitat: minimal effect (-0.008)
  Herd Size: higher in Small vs Large herds (+0.321)

Graze:
  Model explains 56.2% of variance
  Habitat: lower in Open vs Closed (-0.076)
  Herd Size: lower in Small vs Large herds (-0.641)

Other:
  Model explains 0.2% of variance
  Habitat: minimal effect (-0.003)
  Herd Size: higher in Small vs Large herds (+0.013)

In [8]:

Copied!





# ============================================================================
# 6. RESIDUAL ANALYSIS
# ============================================================================

print("\n" + "="*60)
print("RESIDUAL ANALYSIS")
print("="*60)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Residual Analysis for Linear Regression Models', fontsize=16, y=0.98)

for i, behavior in enumerate(behaviors):
    ax = axes[i//2, i%2]
    
    # Get predictions and residuals
    model = regression_results[behavior]['model']
    y_true = df[behavior]
    y_pred = model.predict(X)
    residuals = y_true - y_pred
    
    # Residuals vs Predicted plot
    ax.scatter(y_pred, residuals, alpha=0.6)
    ax.axhline(y=0, color='red', linestyle='--')
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.set_title(f'{behavior} - Residuals vs Predicted')
    
    # Add trend line
    z = np.polyfit(y_pred, residuals, 1)
    p = np.poly1d(z)
    ax.plot(sorted(y_pred), p(sorted(y_pred)), "r--", alpha=0.3)

plt.tight_layout()
plt.show()

print("\nModel assumptions check:")
print("- Look for random scatter around y=0 in residual plots")
print("- Non-random patterns may indicate model inadequacy")
print("- Consider transformations if patterns are evident")
# ============================================================================
# 6. RESIDUAL ANALYSIS
# ============================================================================

print("\n" + "="*60)
print("RESIDUAL ANALYSIS")
print("="*60)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Residual Analysis for Linear Regression Models', fontsize=16, y=0.98)

for i, behavior in enumerate(behaviors):
    ax = axes[i//2, i%2]
    
    # Get predictions and residuals
    model = regression_results[behavior]['model']
    y_true = df[behavior]
    y_pred = model.predict(X)
    residuals = y_true - y_pred
    
    # Residuals vs Predicted plot
    ax.scatter(y_pred, residuals, alpha=0.6)
    ax.axhline(y=0, color='red', linestyle='--')
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Residuals')
    ax.set_title(f'{behavior} - Residuals vs Predicted')
    
    # Add trend line
    z = np.polyfit(y_pred, residuals, 1)
    p = np.poly1d(z)
    ax.plot(sorted(y_pred), p(sorted(y_pred)), "r--", alpha=0.3)

plt.tight_layout()
plt.show()

print("\nModel assumptions check:")
print("- Look for random scatter around y=0 in residual plots")
print("- Non-random patterns may indicate model inadequacy")
print("- Consider transformations if patterns are evident")

============================================================
RESIDUAL ANALYSIS
============================================================

Model assumptions check:
- Look for random scatter around y=0 in residual plots
- Non-random patterns may indicate model inadequacy
- Consider transformations if patterns are evident

In [9]:

Copied!





# ============================================================================
# 7. SUMMARY TABLE
# ============================================================================

print("\n" + "="*60)
print("SUMMARY TABLE")
print("="*60)

summary_df = pd.DataFrame({
    'Behavior': behaviors,
    'R²': [regression_results[b]['r2'] for b in behaviors],
    'Habitat_Coef': [regression_results[b]['coefficients'][0] for b in behaviors],
    'HerdSize_Coef': [regression_results[b]['coefficients'][1] for b in behaviors],
    'Intercept': [regression_results[b]['intercept'] for b in behaviors]
})

print(summary_df.round(4).to_string(index=False))

# Save results to CSV
summary_df.to_csv('regression_results_summary.csv', index=False)
print(f"\nResults saved to 'regression_results_summary.csv'")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)
# ============================================================================
# 7. SUMMARY TABLE
# ============================================================================

print("\n" + "="*60)
print("SUMMARY TABLE")
print("="*60)

summary_df = pd.DataFrame({
    'Behavior': behaviors,
    'R²': [regression_results[b]['r2'] for b in behaviors],
    'Habitat_Coef': [regression_results[b]['coefficients'][0] for b in behaviors],
    'HerdSize_Coef': [regression_results[b]['coefficients'][1] for b in behaviors],
    'Intercept': [regression_results[b]['intercept'] for b in behaviors]
})

print(summary_df.round(4).to_string(index=False))

# Save results to CSV
summary_df.to_csv('regression_results_summary.csv', index=False)
print(f"\nResults saved to 'regression_results_summary.csv'")

print("\n" + "="*60)
print("ANALYSIS COMPLETE")
print("="*60)

============================================================
SUMMARY TABLE
============================================================
Behavior     R²  Habitat_Coef  HerdSize_Coef  Intercept
    Walk 0.4022        0.0868         0.3066     0.0521
 Head Up 0.2500       -0.0084         0.3213     0.1584
   Graze 0.5621       -0.0757        -0.6408     0.7150
   Other 0.0016       -0.0027         0.0129     0.0745

Results saved to 'regression_results_summary.csv'

============================================================
ANALYSIS COMPLETE
============================================================

Analysis of interactions between habitat and herd size¶

In [10]:

Copied!





# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()

df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])

# Create interaction term
df['Habitat_HerdSize_interaction'] = df['Habitat_encoded'] * df['HerdSize_encoded']

# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))

print("\n" + "="*60)
print("ENCODING AND INTERACTION SETUP")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")
print("\nInteraction term interpretation:")
print("  0: Closed + Large (baseline)")
print("  0: Open + Large (habitat effect only)")
print("  0: Closed + Small (herd size effect only)")
print("  1: Open + Small (habitat + herd size + interaction)")

# Check distribution of combinations
print("\nDistribution of combinations:")
combo_counts = df.groupby(['Habitat', 'Herd Size']).size()
print(combo_counts)

# Encode categorical variables
le_habitat = LabelEncoder()
le_herd_size = LabelEncoder()

df['Habitat_encoded'] = le_habitat.fit_transform(df['Habitat'])
df['HerdSize_encoded'] = le_herd_size.fit_transform(df['Herd Size'])

# Create interaction term
df['Habitat_HerdSize_interaction'] = df['Habitat_encoded'] * df['HerdSize_encoded']

# Create mapping for interpretation
habitat_mapping = dict(zip(le_habitat.transform(le_habitat.classes_), le_habitat.classes_))
herdsize_mapping = dict(zip(le_herd_size.transform(le_herd_size.classes_), le_herd_size.classes_))

print("\n" + "="*60)
print("ENCODING AND INTERACTION SETUP")
print("="*60)
print(f"Habitat encoding: {habitat_mapping}")
print(f"Herd Size encoding: {herdsize_mapping}")
print("\nInteraction term interpretation:")
print("  0: Closed + Large (baseline)")
print("  0: Open + Large (habitat effect only)")
print("  0: Closed + Small (herd size effect only)")
print("  1: Open + Small (habitat + herd size + interaction)")

# Check distribution of combinations
print("\nDistribution of combinations:")
combo_counts = df.groupby(['Habitat', 'Herd Size']).size()
print(combo_counts)

============================================================
ENCODING AND INTERACTION SETUP
============================================================
Habitat encoding: {0: 'Closed', 1: 'Open'}
Herd Size encoding: {0: 'Large', 1: 'Small'}

Interaction term interpretation:
  0: Closed + Large (baseline)
  0: Open + Large (habitat effect only)
  0: Closed + Small (herd size effect only)
  1: Open + Small (habitat + herd size + interaction)

Distribution of combinations:
Habitat  Herd Size
Closed   Large        27
         Small         5
Open     Large        32
         Small        18
dtype: int64

In [11]:

Copied!





# ============================================================================
# 2. MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS
# ============================================================================

behaviors = ['Walk', 'Head Up', 'Graze', 'Other']

# Prepare predictors for both models
X_main = df[['Habitat_encoded', 'HerdSize_encoded']]  # Main effects only
X_interaction = df[['Habitat_encoded', 'HerdSize_encoded', 'Habitat_HerdSize_interaction']]  # With interaction

# Store results for comparison
main_effects_results = {}
interaction_results = {}
model_comparison = {}

print("\n" + "="*60)
print("MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS")
print("="*60)

for behavior in behaviors:
    print(f"\n{'-'*50}")
    print(f"BEHAVIOR: {behavior}")
    print(f"{'-'*50}")
    
    y = df[behavior]
    
    # ===================
    # MAIN EFFECTS MODEL
    # ===================
    model_main = LinearRegression()
    model_main.fit(X_main, y)
    y_pred_main = model_main.predict(X_main)
    r2_main = r2_score(y, y_pred_main)
    
    main_effects_results[behavior] = {
        'model': model_main,
        'r2': r2_main,
        'coefficients': model_main.coef_,
        'intercept': model_main.intercept_,
        'predictions': y_pred_main
    }
    
    # ===================
    # INTERACTION MODEL
    # ===================
    model_interaction = LinearRegression()
    model_interaction.fit(X_interaction, y)
    y_pred_interaction = model_interaction.predict(X_interaction)
    r2_interaction = r2_score(y, y_pred_interaction)
    
    interaction_results[behavior] = {
        'model': model_interaction,
        'r2': r2_interaction,
        'coefficients': model_interaction.coef_,
        'intercept': model_interaction.intercept_,
        'predictions': y_pred_interaction
    }
    
    # ===================
    # F-TEST FOR INTERACTION SIGNIFICANCE
    # ===================
    # Calculate F-statistic to test if interaction model is significantly better
    n = len(y)
    
    # Residual Sum of Squares
    rss_main = np.sum((y - y_pred_main)**2)
    rss_interaction = np.sum((y - y_pred_interaction)**2)
    
    # Degrees of freedom
    df_main = n - X_main.shape[1] - 1  # n - p - 1
    df_interaction = n - X_interaction.shape[1] - 1
    
    # F-statistic for interaction term
    f_stat = ((rss_main - rss_interaction) / 1) / (rss_interaction / df_interaction)
    p_value = 1 - stats.f.cdf(f_stat, 1, df_interaction)
    
    # Store comparison results
    model_comparison[behavior] = {
        'r2_improvement': r2_interaction - r2_main,
        'f_statistic': f_stat,
        'p_value': p_value,
        'interaction_significant': p_value < 0.05
    }
    
    # ===================
    # DISPLAY RESULTS
    # ===================
    print(f"\nMAIN EFFECTS MODEL:")
    print(f"  R² = {r2_main:.4f}")
    print(f"  Intercept: {model_main.intercept_:.4f}")
    print(f"  Habitat coef: {model_main.coef_[0]:.4f}")
    print(f"  Herd Size coef: {model_main.coef_[1]:.4f}")
    
    print(f"\nINTERACTION MODEL:")
    print(f"  R² = {r2_interaction:.4f}")
    print(f"  Intercept: {model_interaction.intercept_:.4f}")
    print(f"  Habitat coef: {model_interaction.coef_[0]:.4f}")
    print(f"  Herd Size coef: {model_interaction.coef_[1]:.4f}")
    print(f"  Interaction coef: {model_interaction.coef_[2]:.4f}")
    
    print(f"\nMODEL COMPARISON:")
    print(f"  R² improvement: {r2_interaction - r2_main:.4f}")
    print(f"  F-statistic: {f_stat:.3f}")
    print(f"  p-value: {p_value:.4f}")
    
    significance = ""
    if p_value < 0.001:
        significance = "*** (highly significant)"
    elif p_value < 0.01:
        significance = "** (very significant)"
    elif p_value < 0.05:
        significance = "* (significant)"
    else:
        significance = "(not significant)"
    
    print(f"  Interaction effect: {significance}")
# ============================================================================
# 2. MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS
# ============================================================================

behaviors = ['Walk', 'Head Up', 'Graze', 'Other']

# Prepare predictors for both models
X_main = df[['Habitat_encoded', 'HerdSize_encoded']]  # Main effects only
X_interaction = df[['Habitat_encoded', 'HerdSize_encoded', 'Habitat_HerdSize_interaction']]  # With interaction

# Store results for comparison
main_effects_results = {}
interaction_results = {}
model_comparison = {}

print("\n" + "="*60)
print("MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS")
print("="*60)

for behavior in behaviors:
    print(f"\n{'-'*50}")
    print(f"BEHAVIOR: {behavior}")
    print(f"{'-'*50}")
    
    y = df[behavior]
    
    # ===================
    # MAIN EFFECTS MODEL
    # ===================
    model_main = LinearRegression()
    model_main.fit(X_main, y)
    y_pred_main = model_main.predict(X_main)
    r2_main = r2_score(y, y_pred_main)
    
    main_effects_results[behavior] = {
        'model': model_main,
        'r2': r2_main,
        'coefficients': model_main.coef_,
        'intercept': model_main.intercept_,
        'predictions': y_pred_main
    }
    
    # ===================
    # INTERACTION MODEL
    # ===================
    model_interaction = LinearRegression()
    model_interaction.fit(X_interaction, y)
    y_pred_interaction = model_interaction.predict(X_interaction)
    r2_interaction = r2_score(y, y_pred_interaction)
    
    interaction_results[behavior] = {
        'model': model_interaction,
        'r2': r2_interaction,
        'coefficients': model_interaction.coef_,
        'intercept': model_interaction.intercept_,
        'predictions': y_pred_interaction
    }
    
    # ===================
    # F-TEST FOR INTERACTION SIGNIFICANCE
    # ===================
    # Calculate F-statistic to test if interaction model is significantly better
    n = len(y)
    
    # Residual Sum of Squares
    rss_main = np.sum((y - y_pred_main)**2)
    rss_interaction = np.sum((y - y_pred_interaction)**2)
    
    # Degrees of freedom
    df_main = n - X_main.shape[1] - 1  # n - p - 1
    df_interaction = n - X_interaction.shape[1] - 1
    
    # F-statistic for interaction term
    f_stat = ((rss_main - rss_interaction) / 1) / (rss_interaction / df_interaction)
    p_value = 1 - stats.f.cdf(f_stat, 1, df_interaction)
    
    # Store comparison results
    model_comparison[behavior] = {
        'r2_improvement': r2_interaction - r2_main,
        'f_statistic': f_stat,
        'p_value': p_value,
        'interaction_significant': p_value < 0.05
    }
    
    # ===================
    # DISPLAY RESULTS
    # ===================
    print(f"\nMAIN EFFECTS MODEL:")
    print(f"  R² = {r2_main:.4f}")
    print(f"  Intercept: {model_main.intercept_:.4f}")
    print(f"  Habitat coef: {model_main.coef_[0]:.4f}")
    print(f"  Herd Size coef: {model_main.coef_[1]:.4f}")
    
    print(f"\nINTERACTION MODEL:")
    print(f"  R² = {r2_interaction:.4f}")
    print(f"  Intercept: {model_interaction.intercept_:.4f}")
    print(f"  Habitat coef: {model_interaction.coef_[0]:.4f}")
    print(f"  Herd Size coef: {model_interaction.coef_[1]:.4f}")
    print(f"  Interaction coef: {model_interaction.coef_[2]:.4f}")
    
    print(f"\nMODEL COMPARISON:")
    print(f"  R² improvement: {r2_interaction - r2_main:.4f}")
    print(f"  F-statistic: {f_stat:.3f}")
    print(f"  p-value: {p_value:.4f}")
    
    significance = ""
    if p_value < 0.001:
        significance = "*** (highly significant)"
    elif p_value < 0.01:
        significance = "** (very significant)"
    elif p_value < 0.05:
        significance = "* (significant)"
    else:
        significance = "(not significant)"
    
    print(f"  Interaction effect: {significance}")

============================================================
MODEL COMPARISON: MAIN EFFECTS vs INTERACTION MODELS
============================================================

--------------------------------------------------
BEHAVIOR: Walk
--------------------------------------------------

MAIN EFFECTS MODEL:
  R² = 0.4022
  Intercept: 0.0521
  Habitat coef: 0.0868
  Herd Size coef: 0.3066

INTERACTION MODEL:
  R² = 0.4084
  Intercept: 0.0409
  Habitat coef: 0.1074
  Herd Size coef: 0.3782
  Interaction coef: -0.0978

MODEL COMPARISON:
  R² improvement: 0.0062
  F-statistic: 0.818
  p-value: 0.3685
  Interaction effect: (not significant)

--------------------------------------------------
BEHAVIOR: Head Up
--------------------------------------------------

MAIN EFFECTS MODEL:
  R² = 0.2500
  Intercept: 0.1584
  Habitat coef: -0.0084
  Herd Size coef: 0.3213

INTERACTION MODEL:
  R² = 0.2546
  Intercept: 0.1699
  Habitat coef: -0.0296
  Herd Size coef: 0.2478
  Interaction coef: 0.1004

MODEL COMPARISON:
  R² improvement: 0.0046
  F-statistic: 0.482
  p-value: 0.4894
  Interaction effect: (not significant)

--------------------------------------------------
BEHAVIOR: Graze
--------------------------------------------------

MAIN EFFECTS MODEL:
  R² = 0.5621
  Intercept: 0.7150
  Habitat coef: -0.0757
  Herd Size coef: -0.6408

INTERACTION MODEL:
  R² = 0.5628
  Intercept: 0.7215
  Habitat coef: -0.0877
  Herd Size coef: -0.6824
  Interaction coef: 0.0569

MODEL COMPARISON:
  R² improvement: 0.0008
  F-statistic: 0.137
  p-value: 0.7120
  Interaction effect: (not significant)

--------------------------------------------------
BEHAVIOR: Other
--------------------------------------------------

MAIN EFFECTS MODEL:
  R² = 0.0016
  Intercept: 0.0745
  Habitat coef: -0.0027
  Herd Size coef: 0.0129

INTERACTION MODEL:
  R² = 0.0082
  Intercept: 0.0677
  Habitat coef: 0.0099
  Herd Size coef: 0.0565
  Interaction coef: -0.0595

MODEL COMPARISON:
  R² improvement: 0.0066
  F-statistic: 0.520
  p-value: 0.4730
  Interaction effect: (not significant)

In [12]:

Copied!





# ============================================================================
# 3. VISUALIZATION OF INTERACTION EFFECTS
# ============================================================================

# Create comprehensive interaction plots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Interaction Effects: Predicted vs Observed Behavior by Habitat and Herd Size', 
             fontsize=16, y=0.98)

for i, behavior in enumerate(behaviors):
    ax = axes[i//2, i%2]
    
    # Create separate data for each combination
    combinations = [
        ('Closed', 'Large'), ('Closed', 'Small'), 
        ('Open', 'Large'), ('Open', 'Small')
    ]
    
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
    
    # Plot observed data points
    for j, (habitat, herd_size) in enumerate(combinations):
        mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
        if mask.any():
            observed_values = df[mask][behavior]
            x_pos = j + np.random.normal(0, 0.05, size=len(observed_values))  # Add jitter
            ax.scatter(x_pos, observed_values, alpha=0.6, color=colors[j], 
                      s=30, label=f'{habitat}+{herd_size} (observed)')
    
    # Plot model predictions
    for j, (habitat, herd_size) in enumerate(combinations):
        mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
        if mask.any():
            # Main effects prediction
            main_pred = main_effects_results[behavior]['predictions'][mask].mean()
            # Interaction model prediction
            interaction_pred = interaction_results[behavior]['predictions'][mask].mean()
            
            # Plot predictions as horizontal lines
            ax.hlines(main_pred, j-0.3, j+0.3, colors='red', linestyles='--', 
                     linewidth=2, alpha=0.8)
            ax.hlines(interaction_pred, j-0.2, j+0.2, colors='blue', linestyles='-', 
                     linewidth=3, alpha=0.8)
    
    ax.set_title(f'{behavior}\nInteraction p-value: {model_comparison[behavior]["p_value"]:.4f}')
    ax.set_ylabel('Proportion of Time')
    ax.set_xticks(range(4))
    ax.set_xticklabels([f'{h}\n{hs}' for h, hs in combinations], rotation=0)
    
    # Add legend for first subplot only
    if i == 0:
        from matplotlib.lines import Line2D
        legend_elements = [
            Line2D([0], [0], linestyle='--', color='red', label='Main Effects Model'),
            Line2D([0], [0], linestyle='-', color='blue', label='Interaction Model'),
            Line2D([0], [0], marker='o', color='gray', linestyle='', label='Observed Data')
        ]
        ax.legend(handles=legend_elements, loc='upper right', fontsize=8)

plt.tight_layout()
plt.show()
# ============================================================================
# 3. VISUALIZATION OF INTERACTION EFFECTS
# ============================================================================

# Create comprehensive interaction plots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Interaction Effects: Predicted vs Observed Behavior by Habitat and Herd Size', 
             fontsize=16, y=0.98)

for i, behavior in enumerate(behaviors):
    ax = axes[i//2, i%2]
    
    # Create separate data for each combination
    combinations = [
        ('Closed', 'Large'), ('Closed', 'Small'), 
        ('Open', 'Large'), ('Open', 'Small')
    ]
    
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
    
    # Plot observed data points
    for j, (habitat, herd_size) in enumerate(combinations):
        mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
        if mask.any():
            observed_values = df[mask][behavior]
            x_pos = j + np.random.normal(0, 0.05, size=len(observed_values))  # Add jitter
            ax.scatter(x_pos, observed_values, alpha=0.6, color=colors[j], 
                      s=30, label=f'{habitat}+{herd_size} (observed)')
    
    # Plot model predictions
    for j, (habitat, herd_size) in enumerate(combinations):
        mask = (df['Habitat'] == habitat) & (df['Herd Size'] == herd_size)
        if mask.any():
            # Main effects prediction
            main_pred = main_effects_results[behavior]['predictions'][mask].mean()
            # Interaction model prediction
            interaction_pred = interaction_results[behavior]['predictions'][mask].mean()
            
            # Plot predictions as horizontal lines
            ax.hlines(main_pred, j-0.3, j+0.3, colors='red', linestyles='--', 
                     linewidth=2, alpha=0.8)
            ax.hlines(interaction_pred, j-0.2, j+0.2, colors='blue', linestyles='-', 
                     linewidth=3, alpha=0.8)
    
    ax.set_title(f'{behavior}\nInteraction p-value: {model_comparison[behavior]["p_value"]:.4f}')
    ax.set_ylabel('Proportion of Time')
    ax.set_xticks(range(4))
    ax.set_xticklabels([f'{h}\n{hs}' for h, hs in combinations], rotation=0)
    
    # Add legend for first subplot only
    if i == 0:
        from matplotlib.lines import Line2D
        legend_elements = [
            Line2D([0], [0], linestyle='--', color='red', label='Main Effects Model'),
            Line2D([0], [0], linestyle='-', color='blue', label='Interaction Model'),
            Line2D([0], [0], marker='o', color='gray', linestyle='', label='Observed Data')
        ]
        ax.legend(handles=legend_elements, loc='upper right', fontsize=8)

plt.tight_layout()
plt.show()

In [13]:

Copied!





# ============================================================================
# 4. DETAILED INTERACTION INTERPRETATION
# ============================================================================

print("\n" + "="*60)
print("INTERACTION EFFECTS INTERPRETATION")
print("="*60)

# Calculate predicted values for each combination using interaction model
print("\nPredicted values for each combination (Interaction Model):")
print("-" * 55)

combination_predictions = {}

for behavior in behaviors:
    print(f"\n{behavior}:")
    model = interaction_results[behavior]['model']
    intercept = model.intercept_
    habitat_coef = model.coef_[0]
    herdsize_coef = model.coef_[1]
    interaction_coef = model.coef_[2]
    
    # Calculate predictions for each combination
    closed_large = intercept  # 0 + 0 + 0
    open_large = intercept + habitat_coef  # 1 + 0 + 0
    closed_small = intercept + herdsize_coef  # 0 + 1 + 0
    open_small = intercept + habitat_coef + herdsize_coef + interaction_coef  # 1 + 1 + 1
    
    combination_predictions[behavior] = {
        'Closed+Large': closed_large,
        'Open+Large': open_large,
        'Closed+Small': closed_small,
        'Open+Small': open_small
    }
    
    print(f"  Closed + Large:  {closed_large:.4f}")
    print(f"  Open + Large:    {open_large:.4f}")
    print(f"  Closed + Small:  {closed_small:.4f}")
    print(f"  Open + Small:    {open_small:.4f}")
    
    # Calculate the interaction effect
    # Interaction = (Open+Small) - (Open+Large) - (Closed+Small) + (Closed+Large)
    interaction_effect = open_small - open_large - closed_small + closed_large
    print(f"  Interaction effect: {interaction_effect:.4f}")
    
    if abs(interaction_effect) > 0.01 and model_comparison[behavior]['interaction_significant']:
        if interaction_effect > 0:
            print("  → Open habitat effect is STRONGER in small herds")
        else:
            print("  → Open habitat effect is WEAKER in small herds")
# ============================================================================
# 4. DETAILED INTERACTION INTERPRETATION
# ============================================================================

print("\n" + "="*60)
print("INTERACTION EFFECTS INTERPRETATION")
print("="*60)

# Calculate predicted values for each combination using interaction model
print("\nPredicted values for each combination (Interaction Model):")
print("-" * 55)

combination_predictions = {}

for behavior in behaviors:
    print(f"\n{behavior}:")
    model = interaction_results[behavior]['model']
    intercept = model.intercept_
    habitat_coef = model.coef_[0]
    herdsize_coef = model.coef_[1]
    interaction_coef = model.coef_[2]
    
    # Calculate predictions for each combination
    closed_large = intercept  # 0 + 0 + 0
    open_large = intercept + habitat_coef  # 1 + 0 + 0
    closed_small = intercept + herdsize_coef  # 0 + 1 + 0
    open_small = intercept + habitat_coef + herdsize_coef + interaction_coef  # 1 + 1 + 1
    
    combination_predictions[behavior] = {
        'Closed+Large': closed_large,
        'Open+Large': open_large,
        'Closed+Small': closed_small,
        'Open+Small': open_small
    }
    
    print(f"  Closed + Large:  {closed_large:.4f}")
    print(f"  Open + Large:    {open_large:.4f}")
    print(f"  Closed + Small:  {closed_small:.4f}")
    print(f"  Open + Small:    {open_small:.4f}")
    
    # Calculate the interaction effect
    # Interaction = (Open+Small) - (Open+Large) - (Closed+Small) + (Closed+Large)
    interaction_effect = open_small - open_large - closed_small + closed_large
    print(f"  Interaction effect: {interaction_effect:.4f}")
    
    if abs(interaction_effect) > 0.01 and model_comparison[behavior]['interaction_significant']:
        if interaction_effect > 0:
            print("  → Open habitat effect is STRONGER in small herds")
        else:
            print("  → Open habitat effect is WEAKER in small herds")

============================================================
INTERACTION EFFECTS INTERPRETATION
============================================================

Predicted values for each combination (Interaction Model):
-------------------------------------------------------

Walk:
  Closed + Large:  0.0409
  Open + Large:    0.1483
  Closed + Small:  0.4191
  Open + Small:    0.4287
  Interaction effect: -0.0978

Head Up:
  Closed + Large:  0.1699
  Open + Large:    0.1403
  Closed + Small:  0.4177
  Open + Small:    0.4885
  Interaction effect: 0.1004

Graze:
  Closed + Large:  0.7215
  Open + Large:    0.6337
  Closed + Small:  0.0390
  Open + Small:    0.0082
  Interaction effect: 0.0569

Other:
  Closed + Large:  0.0677
  Open + Large:    0.0776
  Closed + Small:  0.1242
  Open + Small:    0.0746
  Interaction effect: -0.0595

In [14]:

Copied!





# ============================================================================
# 5. SUMMARY COMPARISON TABLE
# ============================================================================

print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)

summary_comparison = pd.DataFrame({
    'Behavior': behaviors,
    'Main_R2': [main_effects_results[b]['r2'] for b in behaviors],
    'Interaction_R2': [interaction_results[b]['r2'] for b in behaviors],
    'R2_Improvement': [model_comparison[b]['r2_improvement'] for b in behaviors],
    'F_Statistic': [model_comparison[b]['f_statistic'] for b in behaviors],
    'P_Value': [model_comparison[b]['p_value'] for b in behaviors],
    'Interaction_Significant': [model_comparison[b]['interaction_significant'] for b in behaviors]
})

print(summary_comparison.round(4))

# Count significant interactions
significant_count = sum(summary_comparison['Interaction_Significant'])
print(f"\nNumber of behaviors with significant interactions: {significant_count}/4")

if significant_count > 0:
    print("\nBehaviors with significant interactions:")
    significant_behaviors = summary_comparison[summary_comparison['Interaction_Significant']]['Behavior'].tolist()
    for behavior in significant_behaviors:
        p_val = model_comparison[behavior]['p_value']
        r2_imp = model_comparison[behavior]['r2_improvement']
        print(f"  - {behavior}: p = {p_val:.4f}, R² improvement = {r2_imp:.4f}")

# ============================================================================
# 6. RECOMMENDATION
# ============================================================================

print("\n" + "="*60)
print("RECOMMENDATION")
print("="*60)

if significant_count > 0:
    print("✅ INTERACTION EFFECTS DETECTED!")
    print("\nRecommendation: Use the interaction model for behaviors with significant")
    print("interaction terms. The habitat and herd size effects are not simply additive")
    print("- their combined effect differs from what you'd expect by just adding them up.")
    print("\nThis suggests that:")
    print("- The effect of habitat depends on herd size (or vice versa)")
    print("- Certain combinations have unique behavioral responses")
else:
    print("❌ NO SIGNIFICANT INTERACTIONS DETECTED")
    print("\nRecommendation: Stick with the main effects model. The habitat and")
    print("herd size effects appear to be additive - you can predict the combined")
    print("effect by simply adding the individual effects together.")

# Save detailed results
summary_comparison.to_csv('interaction_model_comparison.csv', index=False)
print(f"\nDetailed results saved to 'interaction_model_comparison.csv'")

print("\n" + "="*60)
print("INTERACTION ANALYSIS COMPLETE")
print("="*60)
# ============================================================================
# 5. SUMMARY COMPARISON TABLE
# ============================================================================

print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)

summary_comparison = pd.DataFrame({
    'Behavior': behaviors,
    'Main_R2': [main_effects_results[b]['r2'] for b in behaviors],
    'Interaction_R2': [interaction_results[b]['r2'] for b in behaviors],
    'R2_Improvement': [model_comparison[b]['r2_improvement'] for b in behaviors],
    'F_Statistic': [model_comparison[b]['f_statistic'] for b in behaviors],
    'P_Value': [model_comparison[b]['p_value'] for b in behaviors],
    'Interaction_Significant': [model_comparison[b]['interaction_significant'] for b in behaviors]
})

print(summary_comparison.round(4))

# Count significant interactions
significant_count = sum(summary_comparison['Interaction_Significant'])
print(f"\nNumber of behaviors with significant interactions: {significant_count}/4")

if significant_count > 0:
    print("\nBehaviors with significant interactions:")
    significant_behaviors = summary_comparison[summary_comparison['Interaction_Significant']]['Behavior'].tolist()
    for behavior in significant_behaviors:
        p_val = model_comparison[behavior]['p_value']
        r2_imp = model_comparison[behavior]['r2_improvement']
        print(f"  - {behavior}: p = {p_val:.4f}, R² improvement = {r2_imp:.4f}")

# ============================================================================
# 6. RECOMMENDATION
# ============================================================================

print("\n" + "="*60)
print("RECOMMENDATION")
print("="*60)

if significant_count > 0:
    print("✅ INTERACTION EFFECTS DETECTED!")
    print("\nRecommendation: Use the interaction model for behaviors with significant")
    print("interaction terms. The habitat and herd size effects are not simply additive")
    print("- their combined effect differs from what you'd expect by just adding them up.")
    print("\nThis suggests that:")
    print("- The effect of habitat depends on herd size (or vice versa)")
    print("- Certain combinations have unique behavioral responses")
else:
    print("❌ NO SIGNIFICANT INTERACTIONS DETECTED")
    print("\nRecommendation: Stick with the main effects model. The habitat and")
    print("herd size effects appear to be additive - you can predict the combined")
    print("effect by simply adding the individual effects together.")

# Save detailed results
summary_comparison.to_csv('interaction_model_comparison.csv', index=False)
print(f"\nDetailed results saved to 'interaction_model_comparison.csv'")

print("\n" + "="*60)
print("INTERACTION ANALYSIS COMPLETE")
print("="*60)

============================================================
MODEL COMPARISON SUMMARY
============================================================
  Behavior  Main_R2  Interaction_R2  R2_Improvement  F_Statistic  P_Value  \
0     Walk   0.4022          0.4084          0.0062       0.8181   0.3685   
1  Head Up   0.2500          0.2546          0.0046       0.4824   0.4894   
2    Graze   0.5621          0.5628          0.0008       0.1373   0.7120   
3    Other   0.0016          0.0082          0.0066       0.5200   0.4730   

   Interaction_Significant  
0                    False  
1                    False  
2                    False  
3                    False  

Number of behaviors with significant interactions: 0/4

============================================================
RECOMMENDATION
============================================================
❌ NO SIGNIFICANT INTERACTIONS DETECTED

Recommendation: Stick with the main effects model. The habitat and
herd size effects appear to be additive - you can predict the combined
effect by simply adding the individual effects together.

Detailed results saved to 'interaction_model_comparison.csv'

============================================================
INTERACTION ANALYSIS COMPLETE
============================================================