import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import shap

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('/Users/rachel/Library/CloudStorage/Dropbox-TLPSummerInterns/TLP Summer Intern Folder/Zhou/CODE - MPS_data_july_2023/mps_student_activity.tsv', sep='\t', on_bad_lines='skip')
data_detail = pd.read_csv('/Users/rachel/Library/CloudStorage/Dropbox-TLPSummerInterns/TLP Summer Intern Folder/Zhou/CODE - MPS_data_july_2023/mps_student_activity_detail.tsv', sep='\t',on_bad_lines='skip')
data_assessment = pd.read_csv('/Users/rachel/Library/CloudStorage/Dropbox-TLPSummerInterns/TLP Summer Intern Folder/Zhou/CODE - MPS_data_july_2023/mps_assessments.tsv', sep='\t', on_bad_lines='skip')
data_assessment['school_year'] = data_assessment['script_name'].str[-4:]

5.3 Predictive Modeling of Assessment Performance#

  1. Feature Engineering: Created new features implementation_level, first_year_of_teaching, child_level_type, and learning profile labels.

  2. Model Training and Evaluation: Leveraged linear regression and machine learning techniques to predict student assessment scores, evaluating each model with MAE, MSE, RMSE, and R-squared.

  3. Feature Importance Analysis: Interpreted the feature importance of the models to understand the relationship between the predictors and the target variable.

Feature Engineering#

Three features were generated from the raw data:

implementation_level: Categorized teachers into three levels (Low, Medium, High) based on their cumulative activity over time using K-Means clustering. This helps differentiate teachers by experience level.

first_year_of_teaching: Indicated whether a given school year is a teacher's first in the data. This captures how teacher proficiency changes over time.

child_level_type: Represented the quiz item type, which can be Multi (single-select multiple choice), Multi2 (choose-two multiple choice), or Match (matching items).

Teacher Implementation Level Clustering#

The cluster_teachers_by_implementation function processes the activity dataset to categorize teachers by their cumulative implementation level across school years.

The implementation levels are determined by metrics such as the cumulative unique number of scripts assigned and the cumulative unique number of students they’ve interacted with up to a particular school year.

def compute_cumulative_unique(series):
    # Running count of distinct items seen so far, one value per school year
    cumulative_list = []
    seen = set()
    for s in series:
        seen.update(s)
        cumulative_list.append(len(seen))
    return cumulative_list

def cluster_teachers_by_implementation(data):
    # Step 1: Preprocess the Data
    data_sorted = data.sort_values(by=['teacher_user_id', 'school_year'])

    # Step 2: Compute cumulative unique values for scripts and students
    cumulative_data = data_sorted.groupby(['teacher_user_id', 'school_year']).agg({
        'script_id': 'unique',
        'student_user_id': 'unique'
    }).reset_index()
    cumulative_data['cumulative_scripts'] = cumulative_data.groupby('teacher_user_id')['script_id'].transform(compute_cumulative_unique)
    cumulative_data['cumulative_students'] = cumulative_data.groupby('teacher_user_id')['student_user_id'].transform(compute_cumulative_unique)

    # Step 3: Scale the Metrics
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(cumulative_data[['cumulative_scripts', 'cumulative_students']])

    # Step 4: Apply KMeans Clustering
    kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
    cumulative_data['cluster_id'] = kmeans.fit_predict(scaled_data)

    # Step 5: Convert cluster IDs to labels (this mapping reflects the centroids of this
    # particular run; K-Means numbers its clusters arbitrarily)
    cluster_labels = {0: 'Low', 2: 'Medium', 1: 'High'}
    cumulative_data['implementation_level'] = cumulative_data['cluster_id'].map(cluster_labels)
    cluster_membership_df = cumulative_data[['teacher_user_id', 'school_year', 'implementation_level']].copy()
    cluster_membership_df['school_year'] = cluster_membership_df['school_year'].str[:4]
    
    return cluster_membership_df

implementation = cluster_teachers_by_implementation(data)
df_merged = pd.merge(data_assessment, implementation, on=['teacher_user_id', 'school_year'], how='left')
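Because K-Means assigns cluster IDs arbitrarily, the hard-coded {0: 'Low', 2: 'Medium', 1: 'High'} mapping in Step 5 only holds for this run's centroids. A minimal sketch of a run-independent alternative (assuming three clusters fit on the two scaled metrics, as above) orders the cluster IDs by centroid magnitude instead:

def label_clusters_by_centroid(kmeans):
    # Sort cluster IDs by total centroid magnitude in the scaled feature space (ascending),
    # then assign Low/Medium/High in that order
    order = np.argsort(kmeans.cluster_centers_.sum(axis=1))
    return {cluster_id: label for cluster_id, label in zip(order, ['Low', 'Medium', 'High'])}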

Question Type and Teacher Experience Features#

Previous descriptive analysis of the assessment dataset showed that correctness varies by course and question type, so here we extracted the feature child_level_type to control for its impact.

# Compute the mean correctness rate per (course, parent level, child level type);
# it is inverted into a difficulty coefficient below
difficulty_coefficients = data_assessment.groupby(['course_name', 'parent_level_type', 'child_level_type'])['answer_correct_flag'].apply(lambda x: (x == 'Y').mean()).reset_index()
difficulty_coefficients.columns = ['course_name', 'parent_level_type', 'child_level_type', 'difficulty_coefficient']
df_merged = pd.merge(df_merged, difficulty_coefficients, on=['course_name', 'parent_level_type', 'child_level_type'], how='left')

# Find the first school year in which each teacher appears in the data
first_year_taught = df_merged.groupby('teacher_user_id')['school_year'].min().reset_index()
first_year_taught.columns = ['teacher_user_id', 'first_year_taught']
df_merged = pd.merge(df_merged, first_year_taught, on='teacher_user_id', how='left')
# Flag whether the current year is the teacher's first year (1 = first year)
df_merged['first_year_of_teaching'] = (df_merged['school_year'] == df_merged['first_year_taught']).astype(int)

# Replace 'Y' and 'N' in 'answer_correct_flag' with 1 and 0 respectively
df_merged['answer_correct_flag'] = df_merged['answer_correct_flag'].map({'Y': 1, 'N': 0})
df_model = df_merged[['teacher_user_id', 'course_name', 'school_year', 'parent_level_type', 'child_level_type', 'student_user_id', 'implementation_level', 'difficulty_coefficient', 'first_year_of_teaching', 'answer_correct_flag']].copy()
df_model['difficulty_coefficient'] = 1 - df_model['difficulty_coefficient']  # invert correctness rate into difficulty

Student Learning Behavior Features#

# Calculating the time gap between created_at and updated_at timestamps
data_detail['created_at'] = pd.to_datetime(data_detail['created_at'])
data_detail['updated_at'] = pd.to_datetime(data_detail['updated_at'])
data_detail['time_gap'] = (data_detail['updated_at'] - data_detail['created_at']).dt.total_seconds() 
# Filter to rows with at least one attempt and a plausible time gap (0 < gap <= 15 minutes)
filtered_data = data_detail[(data_detail['attempts'] != 0) &
                            (data_detail['time_gap'] > 0) &
                            (data_detail['time_gap'] <= 60 * 15)].copy()

# Categorizing the test results based on provided breakdown
def categorize_test_result(result):
    if result < 20 and result != -1:
        return 'Failed'
    elif 20 <= result < 30:
        return 'Pass (Not Optimal)'
    elif 30 <= result < 1000:
        return 'Optimal'
    else:
        return 'Special'

filtered_data['test_result_category'] = filtered_data['best_result'].apply(categorize_test_result)
"""
# Identify Learning Behaviors for the filtered data
def assign_behavior_label(row):
    # Check for Rapid Guessing first
    if (row['attempts'] > 1) and (row['time_gap'] / row['attempts'] <= 5) and (row['test_result_category'] in ['Optimal', 'Pass (Not Optimal)']):
        return "Rapid Guessing"
    elif (row['attempts'] == 1) and (row['test_result_category'] == 'Failed'):
        return "One-shot Failures"
    elif (row['attempts'] > 3) and (row['test_result_category'] == 'Failed'):
        return "Strugglers"
    elif row['test_result_category'] == 'Pass (Not Optimal)':
        return "Suboptimal Success"
    elif (row['attempts'] == 1) and (row['test_result_category'] == 'Optimal'):
        return "Succeed at First Trial"
    elif (row['attempts'] > 3) and (row['time_gap'] > 60) and (row['test_result_category'] == 'Optimal'):
        return "Consistent Learner"
    else:
        return "Others"
"""  
def assign_behavior_label(row):
    # Check for Rapid Guessing first
    if (row['attempts'] > 1) and (row['time_gap'] / row['attempts'] <= 5) and (row['test_result_category'] in ['Optimal', 'Pass (Not Optimal)']):
        return "Rapid Guessing"
    elif (row['attempts'] == 1) and (row['test_result_category'] == 'Failed'):
        return "One-shot Failures"
    elif (row['attempts'] > 3) and (row['test_result_category'] == 'Failed'):
        return "Strugglers"
    elif row['test_result_category'] == 'Pass (Not Optimal)':
        return "Suboptimal Success"
    elif (row['attempts'] == 1) and (row['test_result_category'] == 'Optimal'):
        return "Succeed at First Trial"
    elif (row['attempts'] >= 3) and (row['time_gap'] > 59) and (row['test_result_category'] == 'Optimal'):  # both thresholds sit above the 50th percentile
        return "Consistent Learner"
    else:
        return "Others"
    
filtered_data['behavior_label'] = filtered_data.apply(assign_behavior_label, axis=1)

# Counting occurrences of each label
behavior_counts = filtered_data['behavior_label'].value_counts()

# convert counts to proportion to standardize the data
grouped_data = filtered_data.groupby(['student_user_id', 'school_year', 'behavior_label']).size().unstack().reset_index().fillna(0)
grouped_data.iloc[:, 2:] = grouped_data.iloc[:, 2:].div(grouped_data.iloc[:, 2:].sum(axis=1), axis=0)

# Standardizing the data
scaler = StandardScaler()
grouped_data.iloc[:, 2:] = scaler.fit_transform(grouped_data.iloc[:, 2:])
grouped_data['school_year'] = grouped_data['school_year'].str[:4]
df_model = pd.merge(df_model, grouped_data, on=['student_user_id','school_year'], how='left')
df_model.head()
teacher_user_id course_name school_year parent_level_type child_level_type student_user_id implementation_level difficulty_coefficient first_year_of_teaching answer_correct_flag Consistent Learner One-shot Failures Others Rapid Guessing Strugglers Suboptimal Success Succeed at First Trial
0 24529224 csd 2017 Gamelab Multi 33728229 High 0.64523 1 0.0 -0.945497 -0.026028 0.655204 0.984262 -0.235071 -0.450502 -0.30247
1 24529224 csd 2017 Gamelab Multi 33718947 High 0.64523 1 1.0 -1.047306 -0.026028 0.668506 -0.483908 1.368678 -0.450502 -0.30247
2 24529224 csd 2017 Gamelab Multi 29432841 High 0.64523 1 0.0 -0.679255 -0.026028 0.960175 0.238762 -0.466998 -0.450502 -0.30247
3 24529224 csd 2017 Gamelab Multi 33752236 High 0.64523 1 0.0 -0.993994 -0.026028 1.486539 -0.248622 -0.466998 -0.450502 -0.30247
4 24529224 csd 2017 Gamelab Multi 33752305 High 0.64523 1 0.0 -0.868710 -0.026028 1.591301 -0.636637 -0.466998 -0.450502 -0.30247

Model Training and Evaluation#

  1. Split the data into training and testing sets.

  2. Train Linear Regression, Random Forest, Gradient Boosting, Support Vector Regression, Ridge, Lasso, and Neural Network models.

  3. Evaluate the models on the test set using MAE, MSE, RMSE, and R-squared.

Model Selection#

| Model | Description | Characteristics |
| --- | --- | --- |
| Linear Regression | Models the linear relationship between the dependent and independent variables. | Adv: simple, interpretable. Disadv: assumes linear relationships; sensitive to outliers. |
| Random Forest Regressor | Ensemble method using multiple decision trees. | Adv: handles linear and non-linear data; less prone to overfitting. Disadv: can be computationally intensive. |
| Gradient Boosting Regressor | Boosting technique that builds trees sequentially. | Adv: high predictive accuracy; captures subtle patterns. Disadv: requires tuning; can overfit noisy data. |
| Support Vector Regression (SVR) | Regression technique that finds hyperplanes in high-dimensional space. | Adv: effective in high-dimensional spaces; models non-linear relationships. Disadv: needs data scaling; kernel and parameter choice are crucial. |
| Ridge Regression | Linear regression with L2 regularization. | Adv: counters multicollinearity; prevents overfitting. Disadv: does not provide feature selection. |
| Lasso Regression | Linear regression with L1 regularization. | Adv: can perform feature selection. Disadv: might underfit with excessive regularization. |
| Neural Network | Comprises layers of interconnected nodes. | Adv: models complex relationships; flexible. Disadv: needs more data; black-box nature; computationally intensive. |
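As the table notes, SVR and the neural network are sensitive to feature scale, yet the models below are trained on unscaled features. A minimal sketch of one alternative (not used in the results that follow) wraps the scale-sensitive models in a Pipeline so the scaler is fit on training data only:

from sklearn.pipeline import make_pipeline

# Hypothetical scaled variants; StandardScaler, SVR, and MLPRegressor are imported above
svr_scaled = make_pipeline(StandardScaler(), SVR())
nn_scaled = make_pipeline(StandardScaler(), MLPRegressor(max_iter=1000, random_state=36))
# e.g. svr_scaled.fit(X_train, y_train) then svr_scaled.predict(X_test)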

data_cleaned = df_model.dropna()

# Feature Engineering: Aggregate answer_correct_flag
aggregated_data = data_cleaned.groupby(['teacher_user_id', 'course_name', 'school_year', 
                                        'parent_level_type', 'child_level_type', 'student_user_id'])['answer_correct_flag'].mean().reset_index()
aggregated_data.rename(columns={'answer_correct_flag': 'avg_answer_correct'}, inplace=True)

# Merging aggregated_data with data_cleaned
merged_data = pd.merge(aggregated_data, data_cleaned.drop(columns=['answer_correct_flag']), 
                       on=['teacher_user_id', 'course_name', 'school_year', 'parent_level_type', 
                           'child_level_type', 'student_user_id'], 
                       how='left').drop_duplicates()

# Encode categorical variables
cols_to_drop = ['teacher_user_id', 'course_name', 'school_year', 'parent_level_type', 'student_user_id', 'avg_answer_correct','difficulty_coefficient','Others']
merged_data_encoded = pd.get_dummies(merged_data.drop(columns=cols_to_drop), columns=['implementation_level', 'child_level_type'])

X = merged_data_encoded
y = merged_data['avg_answer_correct']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=36)

Model Performance#

# Train regression models
linear_reg = LinearRegression().fit(X_train, y_train)
rf_reg = RandomForestRegressor(random_state=36).fit(X_train, y_train)
gbm_reg = GradientBoostingRegressor(random_state=36).fit(X_train, y_train)
svr_reg = SVR().fit(X_train, y_train)
ridge_reg = Ridge().fit(X_train, y_train)
lasso_reg = Lasso().fit(X_train, y_train)
nn_reg = MLPRegressor(max_iter=1000, random_state=36).fit(X_train, y_train)

linear_preds = linear_reg.predict(X_test)
rf_preds = rf_reg.predict(X_test)
gbm_preds = gbm_reg.predict(X_test)
svr_preds = svr_reg.predict(X_test)
ridge_preds = ridge_reg.predict(X_test)
lasso_preds = lasso_reg.predict(X_test)
nn_preds = nn_reg.predict(X_test)

regression_metrics = {
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting', 'Support Vector Regression', 'Ridge Regression', 'Lasso Regression', 'Neural Network'],
    'MAE': [mean_absolute_error(y_test, linear_preds), mean_absolute_error(y_test, rf_preds), mean_absolute_error(y_test, gbm_preds), mean_absolute_error(y_test, svr_preds), mean_absolute_error(y_test, ridge_preds), mean_absolute_error(y_test, lasso_preds), mean_absolute_error(y_test, nn_preds)],
    'MSE': [mean_squared_error(y_test, linear_preds), mean_squared_error(y_test, rf_preds), mean_squared_error(y_test, gbm_preds), mean_squared_error(y_test, svr_preds), mean_squared_error(y_test, ridge_preds), mean_squared_error(y_test, lasso_preds), mean_squared_error(y_test, nn_preds)],
    'RMSE': [mean_squared_error(y_test, linear_preds, squared=False), mean_squared_error(y_test, rf_preds, squared=False), mean_squared_error(y_test, gbm_preds, squared=False), mean_squared_error(y_test, svr_preds, squared=False),mean_squared_error(y_test, ridge_preds, squared=False), mean_squared_error(y_test, lasso_preds, squared=False), mean_squared_error(y_test, nn_preds, squared=False)],
    'R-squared': [r2_score(y_test, linear_preds), r2_score(y_test, rf_preds), r2_score(y_test, gbm_preds), r2_score(y_test, svr_preds),r2_score(y_test, ridge_preds), r2_score(y_test, lasso_preds), r2_score(y_test, nn_preds)]
}

performance_df_reg = pd.DataFrame(regression_metrics)
performance_df_reg
Model MAE MSE RMSE R-squared
0 Linear Regression 0.312055 0.134970 0.367383 0.167158
1 Random Forest 0.295157 0.141233 0.375809 0.128515
2 Gradient Boosting 0.284917 0.121339 0.348338 0.251268
3 Support Vector Regression 0.280131 0.135936 0.368695 0.161196
4 Ridge Regression 0.312097 0.134972 0.367386 0.167144
5 Lasso Regression 0.351759 0.162388 0.402974 -0.002027
6 Neural Network 0.285009 0.126396 0.355522 0.220067

On this dataset, the Gradient Boosting (R-squared ≈ 0.25) and Neural Network (≈ 0.22) models outperform the baseline Linear Regression model (≈ 0.17).
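Since a single train/test split can be noisy, a quick robustness check (a sketch, not part of the original evaluation) is to cross-validate the two leading models on the same X and y:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R-squared for the two best single-split models
for name, model in [('Gradient Boosting', GradientBoostingRegressor(random_state=36)),
                    ('Neural Network', MLPRegressor(max_iter=1000, random_state=36))]:
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f'{name}: mean R-squared = {scores.mean():.3f} (std = {scores.std():.3f})')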

Feature Importance Analysis#

Feature importance plots for Gradient Boosting and Neural Network are presented, since they are the two best-performing models.

Feature Importance Plots#

gb_feature_importance = gbm_reg.feature_importances_
# Extracting feature importance for Neural Network (using permutation importance)
nn_perm_importance = permutation_importance(nn_reg, X_test, y_test, n_repeats=30, random_state=36)
nn_feature_importance = nn_perm_importance.importances_mean

# Creating DataFrames for visualization
gb_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': gb_feature_importance
}).sort_values(by='Importance', ascending=False)

nn_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': nn_feature_importance
}).sort_values(by='Importance', ascending=False)

# Visualizing the feature importances
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 6))

# Gradient Boosting feature importance
sns.barplot(x='Importance', y='Feature', data=gb_importance_df.head(8), ax=axes[0], palette='viridis')
axes[0].set_title('Top 8 Feature Importances from Gradient Boosting')
axes[0].set_xlabel('Importance Score')
axes[0].set_ylabel('Features')

# Neural Network feature importance
sns.barplot(x='Importance', y='Feature', data=nn_importance_df.head(8), ax=axes[1], palette='viridis')
axes[1].set_title('Top 8 Feature Importances from Neural Network (Permutation Importance)')
axes[1].set_xlabel('Importance Score')
axes[1].set_ylabel('Features')

plt.tight_layout()
plt.show()
[Figure: Top 8 feature importances from Gradient Boosting (top panel) and from the Neural Network via permutation importance (bottom panel)]

Visualizing Gradient Boosting Importance: SHAP#

SHAP (SHapley Additive exPlanations) values provide a unified measure of feature importance and effects, taking interaction effects among features into account. Each prediction receives one SHAP value per feature, indicating the direction and magnitude of that feature's effect.

# Compute SHAP values for Gradient Boosting Regressor
explainer_gbm = shap.TreeExplainer(gbm_reg)
shap_values_gbm = explainer_gbm.shap_values(X_train)
shap.summary_plot(shap_values_gbm, X_train)
[Figure: SHAP summary plot for the Gradient Boosting Regressor]
# For the Neural Network model using KernelExplainer
nn_background_data = shap.sample(X_train, 100)  # Randomly sample 100 instances from training data as background data
nn_explainer = shap.KernelExplainer(nn_reg.predict, nn_background_data)
nn_shap_values = nn_explainer.shap_values(X_test, nsamples=100)  # Using 100 Monte Carlo samples

shap.summary_plot(nn_shap_values, X_test)
[Figure: SHAP summary plot for the Neural Network model]

Vertical Axis: Each row on the y-axis corresponds to a feature in the dataset. Features at the top are the most influential, with significance decreasing as you move downwards.

Horizontal Axis: Represents the SHAP value for each feature. The further a dot lies from zero, the greater its impact on the prediction. Right of zero (positive values): features that push the prediction higher. Left of zero (negative values): features that push the prediction lower.

Color: The color signifies the magnitude of the feature value. Red dots: high feature values; for example, red dots on the right suggest that high values of that feature increase predicted assessment scores. Blue dots: low feature values; for example, blue dots on the right suggest that low values of that feature increase predicted assessment scores.

A mix of red and blue dots spread across both sides indicates that the feature can either increase or decrease the prediction, depending on its value and potential interactions with other features. For instance, both very high and very low values of a feature might increase the prediction, while mid-range values decrease it.
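To probe such a mixed pattern for a single feature, a SHAP dependence plot can be drawn from the values already computed for the Gradient Boosting model (a minimal sketch; the choice of first_year_of_teaching is illustrative, and any column of X_train works):

# Scatter of one feature's values against its SHAP values;
# SHAP auto-selects a second feature for interaction coloring
shap.dependence_plot('first_year_of_teaching', shap_values_gbm, X_train)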