Hide code cell source
import pandas as pd
import numpy as np
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import (ColumnDataSource, HoverTool, LinearAxis, Range1d, Span, Label,
                          LabelSet, FactorRange, ColorBar, LinearColorMapper, BasicTicker)
from bokeh.transform import cumsum, dodge, factor_cmap
from bokeh.layouts import gridplot, column
from bokeh.palettes import Spectral8, Spectral6, Pastel1
output_notebook()

# Load the assessments export (tab-separated); the path below points to a local Dropbox folder
data = pd.read_csv('/Users/rachel/Library/CloudStorage/Dropbox-TLPSummerInterns/TLP Summer Intern Folder/Zhou/CODE - MPS_data_july_2023/mps_assessments.tsv', sep='\t', on_bad_lines='skip')

4.3 Assessment Analysis#

Hide code cell source
def calculate_correctness(df): 
    
    df = df.dropna(subset=['answer_correct_flag'])
    summary = df.groupby(['course_name', 'parent_level_type', 'child_level_type'])['answer_correct_flag'].value_counts().unstack().reset_index()
    summary.columns = ['course_name', 'parent_level_type', 'child_level_type', 'N', 'Y']
    summary['correctness_proportion'] = summary['Y'] / (summary['Y'] + summary['N'])
    
    # Calculate the overall average correctness proportion
    overall_avg = summary['correctness_proportion'].mean()
    
    return summary, overall_avg

summary_correctness, overall_avg = calculate_correctness(data)    
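
As a quick sanity check, the helper can be exercised on a small hand-built frame. The rows below are purely illustrative (not taken from the real dataset) and only assume the four columns that calculate_correctness expects:

# Illustrative toy data only; values are made up for demonstration.
toy = pd.DataFrame({
    'course_name':         ['Course A'] * 6 + ['Course B'] * 6,
    'parent_level_type':   ['LevelGroup'] * 12,
    'child_level_type':    (['Multi'] * 3 + ['Multi2'] * 3) * 2,
    'answer_correct_flag': ['Y', 'Y', 'N', 'Y', 'N', 'N',
                            'Y', 'N', 'N', 'Y', 'Y', 'N'],
})
toy_summary, toy_avg = calculate_correctness(toy)
print(toy_summary[['course_name', 'child_level_type', 'correctness_proportion']])
print(f"Overall average correctness: {toy_avg:.2f}")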

Assessment Performance#

The visualization illustrates the distribution of correctness proportions for each of the three assessment types:

  • Multi: single-select multiple choice

  • Multi2: choose-two multiple choice

  • Match: matching items

The x-axis represents each unique combination of course name and assessment type, while the y-axis represents the proportion of correct answers. Each bar is color-coded by the child assessment type, and the dashed red line indicates the overall average correctness proportion across all courses and assessment types.

Hide code cell source
def plot_separate_bars(df, avg):

    df = df.sort_values(by=['child_level_type', 'course_name'])

    # Combine course_name, parent_level_type, and child_level_type into a single column
    df['course_parent_child'] = list(zip(df.course_name, df.parent_level_type, df.child_level_type))

    source = ColumnDataSource(df)

    factors = sorted(df['course_parent_child'].unique(), key=lambda x: (x[0], x[1], x[2]))

    p = figure(x_range=FactorRange(*factors), height=400, width=800, 
            title="Correctness by Course and Assessment Type", toolbar_location=None)

    p.vbar(x='course_parent_child', top='correctness_proportion', source=source, width=0.8, line_color="white",
        legend_field='child_level_type', fill_color=factor_cmap('child_level_type', palette=Spectral6, factors=df['child_level_type'].unique()))
    
    p.yaxis.axis_label = 'Correctness Proportion'

    # Add a horizontal line for the average correctness proportion 
    mean_line = Span(location=avg, dimension='width', line_color='red', line_dash='dashed', line_width=2)
    p.add_layout(mean_line)

    hover = HoverTool()
    hover.tooltips = [("Correctness Proportion", "@correctness_proportion"), 
                    ("Child Level Type", "@child_level_type")]
    p.add_tools(hover)

    p.x_range.range_padding = 0.1
    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = 1.2
    p.outline_line_color = None

    p.legend.title = "Assessment Type"
    p.add_layout(p.legend[0], 'right')
    p.legend.click_policy = "hide"

    show(p)
Hide code cell source
import matplotlib.pyplot as plt
order = ['Optimal', 'Pass (Not Optimal)', 'Failed', 'Special']

# Categorizing the test results based on the provided breakdown
def categorize_test_result(result):
    if result < 20 and result != -1:
        return 'Failed'
    elif 20 <= result < 30:
        return 'Pass (Not Optimal)'
    elif 30 <= result < 1000:
        return 'Optimal'
    else:
        return 'Special'


def plot_test_results_updated(data):
    data['test_result_category'] = data['best_result'].apply(categorize_test_result)

    test_result_counts = data['test_result_category'].value_counts()
    test_result_counts = test_result_counts[order]

    total_count = test_result_counts.sum()
    labels = [f"{label}\n{count} entries ({count/total_count:.1%})" for label, count in zip(test_result_counts.index, test_result_counts)]
    
    colors = Spectral8[1:len(test_result_counts)+1]
    explode = [0, 0, 0, 0.05]

    plt.figure(figsize=(8, 5))
    wedges, texts = plt.pie(test_result_counts, 
                            labels=labels, 
                            startangle=140,
                            colors=colors,
                            wedgeprops=dict(width=0.25),
                            textprops=dict(color="black"),
                            explode= explode
                            )

    plt.setp(texts, size=7)

    # Adding a legend
    plt.legend(wedges, test_result_counts.index,
              title="Test Results",
              loc="center left",
              bbox_to_anchor=(1, 0, 0.5, 1),fontsize=8)

    plt.title('Distribution of Test Results', fontsize=14)
    plt.tight_layout()
    plt.show()
plot_separate_bars(summary_correctness, overall_avg)

Insights

Diversity in Assessment Types: Across all courses, Match assessments generally have correctness proportions that are higher than or comparable to those of the other types. This might indicate that students find these assessments easier or more intuitive.

Difficulty with Choose-two Multiple Choice: Multi2 assessments consistently present a challenge, with correctness proportions often lower than those of the other types. This could be because students must identify two correct answers, which increases cognitive load and the chance of partial correctness (getting one answer right but not the other).
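
A back-of-the-envelope illustration of this point (hypothetical numbers, not drawn from the data): if a student identifies each correct option independently with probability p, the chance of a fully correct choose-two response is roughly p², which is always below p.

# Hypothetical independence model: per-option success probability p vs.
# the chance that a choose-two item is answered fully correctly (~ p**2).
for p in (0.9, 0.8, 0.7):
    print(f"p = {p:.1f}: single-answer item ≈ {p:.2f}, choose-two item ≈ {p * p:.2f}")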

Rethinking Multi2 Assessments: Given the consistently lower correctness proportions for Multi2 across courses, educators might consider revisiting the design, frequency, or teaching approach for these assessments. Are they too challenging? Are students adequately prepared for them? Or is their format not conducive to the learning objectives of the course?

Leveraging Match Assessments: The relatively consistent correctness proportions for Match assessments across courses suggest they are an effective assessment tool. They might be offering students a more engaging or intuitive way to demonstrate understanding. Teachers could consider using them more frequently or in critical areas of the curriculum.

Tailored Interventions: Recognizing these patterns can help educators tailor interventions. For instance, additional resources or practice exercises can be provided specifically for Multi2 type questions, especially in the “CS Discoveries (6-10)” LevelGroup Units.

Feedback Loop: It would be beneficial to gather feedback from students about their experience with these assessment types. Understanding students’ perspectives can offer insights into whether the questions were confusing, the content was not well-understood, or if the format of the questions (especially Multi2) was challenging.

In conclusion, the visualization offers a detailed look into student performance across different courses and assessment types. The insights drawn can guide curriculum designers, educators, and e-learning platforms in optimizing assessments for better learning outcomes.

Assessment Consistency#

The heatmap showcases the correctness proportions for various assessment types across different school years. Each cell’s color intensity represents the proportion of correct answers, with darker shades indicating lower correctness.

Hide code cell source
# Extract school year from script_name and convert answer_correct_flag to numeric
assess_data = data.copy()
assess_data['school_year'] = assess_data['script_name'].str[-4:]
assess_data['correct_flag_numeric'] = assess_data['answer_correct_flag'].map({'Y': 1, 'N': 0})

# Modify the parent_level_name to remove the last five characters and group by school_year and modified parent_level_name
assess_data['parent_level_name_modified'] = assess_data['parent_level_name'].str[:-5]
summary_correctness = assess_data.groupby(['school_year', 'parent_level_name_modified'])['correct_flag_numeric'].mean().reset_index()
summary_correctness.columns = ['course_year', 'parent_level_name', 'correctness_proportion']

# Compute average correctness proportion for each parent_level_name
avg_correctness = summary_correctness.groupby('parent_level_name')['correctness_proportion'].mean().sort_values()

# Filter out assessments with data for only one year
assessment_counts = summary_correctness['parent_level_name'].value_counts()
multi_year_assessments = assessment_counts[assessment_counts > 1].index

# Reorder the rows in summary_correctness based on sorted order of avg_correctness
summary_correctness['parent_level_name'] = pd.Categorical(summary_correctness['parent_level_name'], categories=avg_correctness.index, ordered=True)
filtered_summary = summary_correctness[summary_correctness['parent_level_name'].isin(multi_year_assessments)]

# Function to create heatmap
def create_heatmap(df):
    df = df.reset_index()
    color_mapper = LinearColorMapper(palette="Viridis256", low=df['correctness_proportion'].min(), high=df['correctness_proportion'].max())
    p = figure(x_range=list(map(str, df['course_year'].unique())), 
               y_range=sorted(list(map(str, df['parent_level_name'].unique())),reverse = True),
               x_axis_location="above", width=800, height=400, tools="hover", toolbar_location=None, 
               tooltips=[('Year', '@course_year'), ('Assignment', '@parent_level_name'), ('Correctness Proportion', '@correctness_proportion')])
    p.rect(x="course_year", y="parent_level_name", width=1, height=1, source=df, fill_color={'field': 'correctness_proportion', 'transform': color_mapper}, line_color=None)
    color_bar = ColorBar(color_mapper=color_mapper, location=(0, 0), ticker=BasicTicker())
    p.add_layout(color_bar, 'right')
    show(p)
create_heatmap(filtered_summary)

From the heatmap, we can observe:

Assessment Consistency: By examining the horizontal patterns in the heatmap, we can see that most assessments have maintained consistent performance levels over the years.
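
One way to quantify this consistency (a sketch assuming the filtered_summary frame built in the heatmap cell is still in scope) is to look at how much each assessment's correctness proportion varies across years:

# Year-to-year spread of correctness per assessment; smaller standard deviations
# mean more consistent performance across school years.
consistency = (filtered_summary
               .groupby('parent_level_name', observed=True)['correctness_proportion']
               .agg(['mean', 'std'])
               .sort_values('std'))
print(consistency.head(10))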

Areas of Attention: Lighter cells indicate high correctness proportions, suggesting that students found those assessments or years relatively easier. Darker shades, on the other hand, may indicate more challenging assessments or years where students struggled, signaling areas that might benefit from revised teaching strategies or curriculum adjustments.
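
To turn the darker-cell observation into a concrete list, one simple sketch (again assuming filtered_summary is available) is to sort the year/assessment cells by correctness and inspect the lowest values:

# The darkest heatmap cells correspond to the lowest correctness proportions;
# sorting surfaces the specific assessment/year pairs that may warrant attention.
lowest = filtered_summary.sort_values('correctness_proportion').head(10)
print(lowest[['course_year', 'parent_level_name', 'correctness_proportion']])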