Show code cell source
import pandas as pd
import numpy as np
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, LinearAxis, Range1d, Span, Label, LabelSet, FactorRange, ColorBar, LinearColorMapper, BasicTicker
from bokeh.transform import dodge, factor_cmap, cumsum
from bokeh.layouts import gridplot, column
from bokeh.palettes import Spectral8, Spectral6, Pastel1
output_notebook()
data = pd.read_csv('/Users/rachel/Library/CloudStorage/Dropbox-TLPSummerInterns/TLP Summer Intern Folder/Zhou/CODE - MPS_data_july_2023/mps_assessments.tsv', sep='\t', on_bad_lines='skip')
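Note that the absolute Dropbox path above resolves only on the author's machine. A more portable sketch (DATA_PATH is a placeholder to adjust for a local copy of the export) fails with an explicit message when the file is missing:

from pathlib import Path

# Placeholder path: point this at your local copy of mps_assessments.tsv
DATA_PATH = Path('/Users/rachel/Library/CloudStorage/Dropbox-TLPSummerInterns/TLP Summer Intern Folder/Zhou/CODE - MPS_data_july_2023/mps_assessments.tsv')
if not DATA_PATH.exists():
    raise FileNotFoundError(f'Assessment export not found at {DATA_PATH}; update DATA_PATH.')
data = pd.read_csv(DATA_PATH, sep='\t', on_bad_lines='skip')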
4.3 Assessment Analysis#
Show code cell source
def calculate_correctness(df):
df = df.dropna(subset=['answer_correct_flag'])
summary = df.groupby(['course_name', 'parent_level_type', 'child_level_type'])['answer_correct_flag'].value_counts().unstack().reset_index()
summary.columns = ['course_name', 'parent_level_type', 'child_level_type', 'N', 'Y']
summary['correctness_proportion'] = summary['Y'] / (summary['Y'] + summary['N'])
# Calculate the overall average correctness proportion
overall_avg = summary['correctness_proportion'].mean()
return summary, overall_avg
summary_correctness, overall_avg = calculate_correctness(data)
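As a quick sanity check before plotting, the per-course summary and the overall mean can be inspected directly:

# Peek at the grouped correctness table and the overall average
print(summary_correctness.head())
print(f"Overall average correctness proportion: {overall_avg:.2%}")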
Assessment Performance#
The visualization illustrates the correctness proportions for each course under each of the three assessment types:
Multi: single-select multiple choice
Multi2: choose-two multiple choice
Match: matching items
The x-axis groups the bars by course and assessment type, while the y-axis shows the proportion of correct answers. Each bar represents a unique combination of course name and child assessment type, color-coded by child assessment type. The dashed red line indicates the overall average correctness proportion across all courses and assessment types.
Show code cell source
def plot_separate_bars(df, avg):
df = df.sort_values(by=['child_level_type', 'course_name'])
# Combine course_name, parent_level_type, and child_level_type into a single column
df['course_parent_child'] = list(zip(df.course_name, df.parent_level_type, df.child_level_type))
source = ColumnDataSource(df)
factors = sorted(df['course_parent_child'].unique(), key=lambda x: (x[0], x[1], x[2]))
p = figure(x_range=FactorRange(*factors), height=400, width=800,
title="Correctness by Course and Assessment Type", toolbar_location=None)
p.vbar(x='course_parent_child', top='correctness_proportion', source=source, width=0.8, line_color="white",
legend_field='child_level_type', fill_color=factor_cmap('child_level_type', palette=Spectral6, factors=df['child_level_type'].unique()))
p.yaxis.axis_label = 'Correctness Proportion'
# Add a horizontal line for the average correctness proportion
mean_line = Span(location=avg, dimension='width', line_color='red', line_dash='dashed', line_width=2)
p.add_layout(mean_line)
hover = HoverTool()
hover.tooltips = [("Correctness Proportion", "@correctness_proportion"),
("Child Level Type", "@child_level_type")]
p.add_tools(hover)
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1.2
p.outline_line_color = None
p.legend.title = "Assessment Type"
p.add_layout(p.legend[0], 'right')
p.legend.click_policy = "hide"
show(p)
Show code cell source
import matplotlib.pyplot as plt
order = ['Optimal', 'Pass (Not Optimal)', 'Failed', 'Special']
# Categorizing the test results based on the provided breakdown
def categorize_test_result(result):
if result < 20 and result != -1:
return 'Failed'
elif 20 <= result < 30:
return 'Pass (Not Optimal)'
elif 30 <= result < 1000:
return 'Optimal'
else:
return 'Special'
def plot_test_results_updated(data):
data['test_result_category'] = data['best_result'].apply(categorize_test_result)
test_result_counts = data['test_result_category'].value_counts()
test_result_counts = test_result_counts[order]
total_count = test_result_counts.sum()
labels = [f"{label}\n{count} entries ({count/total_count:.1%})" for label, count in zip(test_result_counts.index, test_result_counts)]
colors = Spectral8[1:len(test_result_counts)+1]
explode = [0, 0, 0, 0.05]
plt.figure(figsize=(8, 5))
wedges, texts = plt.pie(test_result_counts,
labels=labels,
startangle=140,
colors=colors,
wedgeprops=dict(width=0.25),
textprops=dict(color="black"),
explode= explode
)
plt.setp(texts, size=7)
# Adding a legend
plt.legend(wedges, test_result_counts.index,
title="Test Results",
loc="center left",
bbox_to_anchor=(1, 0, 0.5, 1),fontsize=8)
plt.title('Distribution of Test Results', fontsize=14)
plt.tight_layout()
plt.show()
plot_separate_bars(summary_correctness, overall_avg)
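The donut-chart helper defined above is not invoked in this cell; assuming the assessment export also carries the best_result column it reads, the test-result distribution can be drawn with:

plot_test_results_updated(data)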
Insights
Diversity in Assessment Types: Across all courses, Match assessments generally have correctness proportions that are higher than or comparable to those of the other types. This might indicate that students find these assessments easier or more intuitive.
Difficulty in Solving Choose-two Multi-choice: Multi2 assessments consistently present a challenge, with correctness proportions often lower than those of the other types. This could be because students have to discern two correct answers, increasing the cognitive load and the chances of partial correctness (getting one answer right but not the other).
Rethinking Multi2 Assessments: Given the consistently lower correctness proportions for Multi2 across courses, educators might consider revisiting the design, frequency, or teaching approach for these assessments. Are they too challenging? Are students adequately prepared for them? Or is their format not conducive to the learning objectives of the course?
Leveraging Match Assessments: The relatively consistent correctness proportions for Match assessments across courses suggest they are an effective assessment tool. They may offer students a more engaging or intuitive way to demonstrate understanding. Teachers could consider using them more frequently or in critical areas of the curriculum.
Tailored Interventions: Recognizing these patterns can help educators tailor interventions. For instance, additional resources or practice exercises can be provided specifically for Multi2-type questions, especially in the “CS Discoveries (6-10)” LevelGroup Units; a small sketch for surfacing those items follows this list.
Feedback Loop: It would be beneficial to gather feedback from students about their experience with these assessment types. Understanding students’ perspectives can offer insight into whether the questions were confusing, the content was not well understood, or the format of the questions (especially Multi2) was challenging.
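As a concrete starting point for the tailored interventions above, a small sketch, assuming the choose-two type is stored as 'Multi2' in child_level_type, ranks the course and level combinations with the lowest Multi2 correctness:

# Surface the weakest choose-two (Multi2) combinations from the summary table
multi2 = summary_correctness[summary_correctness['child_level_type'] == 'Multi2']
print(multi2.sort_values('correctness_proportion').head(10))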
In conclusion, the visualization offers a detailed look into student performance across different courses and assessment types. The insights drawn can guide curriculum designers, educators, and e-learning platforms in optimizing assessments for better learning outcomes.
Assessment Consistency#
The heatmap showcases the correctness proportions for various assessment types across different school years. Each cell’s color intensity represents the proportion of correct answers, with darker shades indicating lower correctness.
Show code cell source
# Extract school year from script_name and convert answer_correct_flag to numeric
assess_data = data.copy()
assess_data['school_year'] = assess_data['script_name'].str[-4:]
assess_data['correct_flag_numeric'] = assess_data['answer_correct_flag'].map({'Y': 1, 'N': 0})
# Modify the parent_level_name to remove the last five characters and group by school_year and modified parent_level_name
assess_data['parent_level_name_modified'] = assess_data['parent_level_name'].str[:-5]
summary_correctness = assess_data.groupby(['school_year', 'parent_level_name_modified'])['correct_flag_numeric'].mean().reset_index()
summary_correctness.columns = ['course_year', 'parent_level_name', 'correctness_proportion']
# Compute average correctness proportion for each parent_level_name
avg_correctness = summary_correctness.groupby('parent_level_name')['correctness_proportion'].mean().sort_values()
# Filter out assessments with data for only one year
assessment_counts = summary_correctness['parent_level_name'].value_counts()
multi_year_assessments = assessment_counts[assessment_counts > 1].index
# Reorder the rows in summary_correctness based on sorted order of avg_correctness
summary_correctness['parent_level_name'] = pd.Categorical(summary_correctness['parent_level_name'], categories=avg_correctness.index, ordered=True)
filtered_summary = summary_correctness[summary_correctness['parent_level_name'].isin(multi_year_assessments)]
# Function to create heatmap
def create_heatmap(df):
df = df.reset_index()
color_mapper = LinearColorMapper(palette="Viridis256", low=df['correctness_proportion'].min(), high=df['correctness_proportion'].max())
p = figure(x_range=list(map(str, df['course_year'].unique())),
y_range=sorted(list(map(str, df['parent_level_name'].unique())),reverse = True),
x_axis_location="above", width=800, height=400, tools="hover", toolbar_location=None,
tooltips=[('Year', '@course_year'), ('Assignment', '@parent_level_name'), ('Correctness Proportion', '@correctness_proportion')])
p.rect(x="course_year", y="parent_level_name", width=1, height=1, source=df, fill_color={'field': 'correctness_proportion', 'transform': color_mapper}, line_color=None)
color_bar = ColorBar(color_mapper=color_mapper, location=(0, 0), ticker=BasicTicker())
p.add_layout(color_bar, 'right')
show(p)
create_heatmap(filtered_summary)
From the heatmap, we can observe:
Assessment Consistency: Examining the horizontal patterns in the heatmap shows that most assessments have maintained consistent performance levels over the years.
Areas of Attention: Lighter cells indicate high correctness proportions, suggesting that students found those assessments or years relatively easier. Darker shades, on the other hand, point to more challenging assessments or years where students struggled, signaling areas that might benefit from revised teaching strategies or curriculum adjustments.
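To complement the visual read, the year-to-year spread of each assessment's correctness proportion can be computed directly from the filtered_summary frame built above; a minimal sketch, using only the columns defined in that cell:

# Quantify consistency: a smaller standard deviation across years means steadier performance
consistency = (filtered_summary
               .groupby('parent_level_name', observed=True)['correctness_proportion']
               .agg(['mean', 'std', 'count'])
               .sort_values('std'))
print(consistency)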