Hide code cell source
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from bokeh.palettes import Spectral8
from bokeh.models import ColumnDataSource, Whisker, LabelSet
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook, show
from bokeh.layouts import gridplot
from scipy.stats import kruskal
output_notebook()

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('/Users/rachel/Library/CloudStorage/Dropbox-TLPSummerInterns/TLP Summer Intern Folder/Zhou/CODE - MPS_data_july_2023/mps_student_activity.tsv', sep='\t', on_bad_lines='skip')
data_sorted = data.dropna(subset=['teacher_user_id', 'school_year', 'script_id', 'student_user_id']).sort_values(by=['teacher_user_id', 'school_year'])

5.1 Analysis of Teacher Platform Implementation

This section gives a detailed view of teachers’ engagement with, and implementation of, instructional activities on Code.org. Through this analysis, we aimed to segment teachers by their level of platform implementation, gauged through metrics such as the number of unique scripts they assigned and the number of students they taught. The resulting cluster membership indicates how actively teachers are involved with the platform, both in terms of variety (unique scripts) and reach (unique students).

Teacher Implementation Clusters

For each teacher and school year, the compute_cumulative_unique function below computes:

  • The cumulative number of unique scripts assigned up to and including that school year.

  • The cumulative number of unique students taught up to and including that school year.

Hide code cell source
# Collect the set of unique scripts and students for each teacher and school year
cumulative_data = data_sorted.groupby(['teacher_user_id', 'school_year']).agg({
    'script_id': 'unique',
    'student_user_id': 'unique'
}).reset_index()

# Define a function to compute cumulative unique values for a series
def compute_cumulative_unique(series):
    cumulative_list = []
    seen = set()
    for s in series:
        seen.update(s)
        cumulative_list.append(len(seen))
    return cumulative_list

# Apply the function to get cumulative unique scripts and students for each teacher
cumulative_data['cumulative_scripts'] = cumulative_data.groupby('teacher_user_id')['script_id'].transform(compute_cumulative_unique)
cumulative_data['cumulative_students'] = cumulative_data.groupby('teacher_user_id')['student_user_id'].transform(compute_cumulative_unique)

# Drop the temporary columns
cumulative_data = cumulative_data.drop(columns=['script_id', 'student_user_id'])
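
As a quick sanity check, the cumulative-unique logic can be verified directly on toy input (illustrative values, not from the dataset):

# Three "years" of script lists with overlaps:
# unique counts should accumulate to 2, then 3, then 4.
assert compute_cumulative_unique([['a', 'b'], ['b', 'c'], ['c', 'd']]) == [2, 3, 4]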

Before clustering, we use the Elbow method to choose the number of clusters.

The Elbow method is used to determine the optimal number of clusters by fitting the model with a range of cluster values and then plotting the resulting within-cluster sum of squares (WCSS) against the number of clusters.

# Scaling the cumulative metrics
scaler = StandardScaler()
scaled_data = scaler.fit_transform(cumulative_data[['cumulative_scripts', 'cumulative_students']])

# Applying the Elbow method
wcss = []
cluster_range = range(1, 11)

for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

# Plotting the Elbow curve
plt.figure(figsize=(9, 6))
plt.plot(cluster_range, wcss, marker='o', linestyle='--')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid(True)
plt.show()
Figure: Elbow curve of WCSS against the number of clusters.

The choice is based on the point where the rate of decrease of the WCSS starts to slow down, indicating diminishing returns in cluster tightness; here the curve bends at around three clusters.
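
If a reproducible rule is preferred over eyeballing the curve, one simple heuristic (a sketch, not part of the original analysis) is to take the k with the largest second difference of the WCSS, i.e. where the curve bends most sharply:

# Largest second difference of WCSS as a curvature proxy (covers k = 2..9 here)
second_diff = np.diff(wcss, n=2)
elbow_k = cluster_range[int(np.argmax(second_diff)) + 1]
print(f"Heuristic elbow at k = {elbow_k}")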

Hide code cell source
# Applying KMeans clustering with 3 clusters
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
cumulative_data['cluster'] = kmeans.fit_predict(scaled_data)

plt.figure(figsize=(9, 6))

colors = Spectral8[0:3]
for cluster, color in zip(range(3), colors):
    subset = cumulative_data[cumulative_data['cluster'] == cluster]
    plt.scatter(subset['cumulative_scripts'], subset['cumulative_students'], s=100, c=color, label=f'Cluster {cluster}')

plt.title('Implementation Clusters Based on Cumulative Metrics')
plt.xlabel('Cumulative Number of Unique Scripts')
plt.ylabel('Cumulative Number of Unique Students')
plt.legend()
plt.grid(True)
plt.show()
Figure: Scatter plot of teacher implementation clusters based on cumulative unique scripts and students.
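
The choice of three clusters can also be checked with a silhouette score (an optional validation, not in the original notebook); values near 1 indicate well-separated clusters:

from sklearn.metrics import silhouette_score

# Silhouette score on the standardized features used for clustering
score = silhouette_score(scaled_data, cumulative_data['cluster'])
print(f"Silhouette score for k = 3: {score:.2f}")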

Teachers were segmented into three distinct clusters:

Low Implementation: Teachers who, up to a certain year, had assigned fewer unique scripts and engaged with fewer unique students cumulatively.

Medium Implementation: Teachers who had a moderate cumulative count of unique script assignments and student engagements up to that year.

High Implementation: The most active teachers who, cumulatively, assigned a large number of unique scripts and reached many unique students up to that year.
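
To see what these labels correspond to in the original units, the fitted centroids can be mapped back out of the standardized space (a sketch using the scaler and kmeans objects fitted above):

# Centroids expressed in units of cumulative scripts and cumulative students
centers = scaler.inverse_transform(kmeans.cluster_centers_)
print(pd.DataFrame(centers, columns=['cumulative_scripts', 'cumulative_students']).round(1))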

Kruskal-Wallis H-test Results

For each implementation metric, a Kruskal-Wallis H-test was conducted to test for statistically significant differences between the implementation clusters.

Hide code cell source
cumulative_data['cluster_str'] = cumulative_data['cluster'].astype(str)
# Map KMeans cluster ids to labels (ordering determined from the cluster centroids)
implementation_labels = {'0': 'Low Implementation', '2': 'Medium Implementation', '1': 'High Implementation'}
cumulative_data['cluster_str'] = cumulative_data['cluster_str'].map(implementation_labels)
clusters = ['Low Implementation', 'Medium Implementation', 'High Implementation']

plots = []
for aspect in ['cumulative_scripts', 'cumulative_students']:
    # Calculate the Kruskal-Wallis test results
    groups = [group[aspect].dropna() for name, group in cumulative_data.groupby('cluster_str')]
    H_result, p_result = kruskal(*groups)  

    # Effect size: eta-squared for the Kruskal-Wallis test,
    # computed as (H - k + 1) / (n - k) for k groups
    n = sum(len(group) for group in groups)
    k = len(groups)
    eta_squared = (H_result - k + 1) / (n - k)

    # Compute quantiles
    qs = cumulative_data.groupby('cluster_str')[aspect].quantile([0.25, 0.5, 0.75])
    qs = qs.unstack().reset_index()
    qs.columns = ['cluster_str', 'q1', 'q2', 'q3']
    df = pd.merge(cumulative_data, qs, on='cluster_str', how='left')

    # Compute IQR outlier bounds
    iqr = df.q3 - df.q1
    df['upper'] = df.q3 + 1.5*iqr
    df['lower'] = df.q1 - 1.5*iqr

    source = ColumnDataSource(df)

    p = figure(x_range=clusters, y_range=(0, cumulative_data[aspect].max()*1.1), width=380, height=450, y_axis_label=f'{aspect} by cluster', tools="", toolbar_location=None, title=f"Distribution of {aspect} by cluster")

    # quantile boxes
    cmap = factor_cmap('cluster_str', palette=Spectral8[0:3], factors=clusters)
    p.vbar('cluster_str', 0.7, 'q2', 'q3', source=source, color=cmap, line_color='black')
    p.vbar('cluster_str', 0.7, 'q1', 'q2', source=source, color=cmap, line_color='black')

    # outliers
    whisker = Whisker(base='cluster_str', upper='upper', lower='lower', source=source)
    whisker.upper_head.size = whisker.lower_head.size = 6
    p.add_layout(whisker)
    outliers = df[~df[aspect].between(df.lower, df.upper)]
    p.circle('cluster_str', aspect, source=outliers, size=6, color='black', alpha=0.3)

    p.xgrid.grid_line_color = None
    p.axis.major_label_text_font_size="10px"
    p.axis.axis_label_text_font_size="9px"

    source = ColumnDataSource(data=dict(
        x=['Low Implementation'],
        y=[cumulative_data[aspect].max() * 1.05], 
        text=[f"Kruskal-Wallis H = {H_result:.2f}, p = {p_result:.2f}, η² = {eta_squared:.2f}"],
    ))
    labels = LabelSet(x='x', y='y', text='text', level='glyph',
                      x_offset=-15, y_offset=-5, source=source, text_font_size="10pt")
    p.add_layout(labels)

    plots.append(p)

At the top of each plot, the results of the Kruskal-Wallis H-test are displayed. This test is a non-parametric method for determining whether there are statistically significant differences between two or more groups. The p-value gives the probability that differences this large could have arisen by random chance. The η² value is the effect size, which quantifies the magnitude of the observed differences.

grid = gridplot([plots])
show(grid, notebook_handle=True);
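
Because the Kruskal-Wallis test is an omnibus test, a significant result only indicates that at least one cluster differs. A post-hoc sketch (not part of the original analysis) using pairwise Mann-Whitney U tests with a Bonferroni correction can show which pairs differ, illustrated here for cumulative_scripts:

from itertools import combinations
from scipy.stats import mannwhitneyu

pairs = list(combinations(clusters, 2))
for a, b in pairs:
    u, p = mannwhitneyu(
        cumulative_data.loc[cumulative_data['cluster_str'] == a, 'cumulative_scripts'],
        cumulative_data.loc[cumulative_data['cluster_str'] == b, 'cumulative_scripts'],
    )
    print(f"{a} vs {b}: U = {u:.0f}, Bonferroni-adjusted p = {min(p * len(pairs), 1.0):.3f}")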
  • The cumulative unique number of scripts and the cumulative unique number of students both showed clear distinctions between the three implementation clusters. This indicates that teachers’ cumulative engagement with the platform can be effectively gauged by how many unique scripts they have assigned and how many unique students they have reached over the years.

  • Teachers in the High Implementation cluster not only assign more unique scripts cumulatively but also reach a broader unique student audience. They appear to use the platform consistently across school years and to integrate it deeply into their curricula.

  • The statistical tests confirmed that the implementation differences between the clusters are indeed significant.