Introduction
We were discussing in our department the prerequisites for different courses in computer science, and I started wondering about the "normal progression" of students in our Computer Science bachelor's degree. We have a suggested 4-year curriculum, but hardly any students graduate in 4 years. I wanted to see if some visualizations could show any bottlenecks or problem areas in our program.
Methods
I obtained a database of course attempts by all computer science students, and the list of suggested courses by semester for students arriving with pre-calculus and those arriving without pre-calculus.
I prepared a jupyter notebook with the analysis and plots.
Results
See the notebook embedded below.
import pandas as pd
# Read every column as text (dtype='str'); the session codes are later
# indexed character by character in codetonum.
students = pd.read_csv("cursostomados.csv", 
                           dtype='str')
# Preview a few arbitrary rows; the 1: slice leaves out the first column.
students.iloc[[1,200,54,99],1:]
| Curso | Sesion | Nota | |
|---|---|---|---|
| 1 | CISO3122 | 921 | C | 
| 200 | CCOM3033 | B31 | B | 
| 54 | EDFU3002 | 991 | A | 
| 99 | HUMA3101 | A91 | B | 
def makesequences(table):
    """Group course attempts by student.

    Parameters
    ----------
    table : pandas.DataFrame
        One row per course attempt with exactly four columns, in this order:
        student id, course code, session code, grade.

    Returns
    -------
    dict
        Maps each student id to a list of (course, session, grade) tuples,
        in the order the rows appear in ``table``.
    """
    sequences = {}
    # Iterate over the frame that was passed in. The previous version read
    # the global `students` frame and silently ignored its argument.
    for _index, numest, curso, sesion, nota in table.itertuples():
        sequences.setdefault(numest, []).append((curso, sesion, nota))
    return sequences
def codetonum(code):
    """Convert a three-character session code into a fractional calendar year.

    The first character is read as a hexadecimal decade offset from 1900,
    the second as the year within that decade and the third as the semester
    number; every semester after the first adds half a year.
    """
    decade_digit, year_digit, semester_digit = code[0], code[1], code[2]
    whole_years = int(decade_digit, base=16) * 10 + int(year_digit)
    extra_semesters = int(semester_digit) - 1
    return 1900 + whole_years + 0.5 * extra_semesters
sequences = makesequences(students)
len(sequences.keys())
476
# Core course list used below for the "Pre 2007" cohort plot.
# The first three entries are the pre-calculus courses.
# Presumably each following row is one suggested semester — TODO confirm.
prog2007 = ['MATE3018', 'MATE3023', 'MATE3024', # pre-calculus; students starting in calculus should have tested out of these
            'CCOM3033', 'MATE3151', 'CCOM3034', 'MATE3152',
            'CCOM4016', 'CCOM4017', 'CCOM4027', 'MATE3153',
            'CCOM4086', 'CCOM3029', 'MATE4032',
            'CCOM5050', 'MATE4080',
            'CCOM4087', 'CCOM5035'
           ]
# Core course list used below for the "Post 2007" cohort plot and for cs_seq.
# The trailing comments appear to abbreviate the non-core requirements
# suggested for the same row — TODO confirm.
prog2011 = ['MATE3018', 'MATE3023', 'MATE3024', # pre-calculus; students starting in calculus should have tested out of these
            'MATE3151', 'CCOM3030', 'CCOM3981', # espa soc eng
            'CCOM3020', 'CCOM3033', 'CCOM3982', # soc espa eng
            'MATE3152', 'CCOM3034', 'CCOM4016', 'CCOM4086', # sci huma
            'MATE4081', 'MATE4032', 'CCOM4027', 'CCOM4029', # sci huma
            'MATE4080', 'CCOM4017', 'CCOM4030', # libre lite
            'CCOM5050', 'CCOM4065', 'CCOM4205', # art lite
            'MATE5001', 'CCOM5035', 'CCOM4995', # SCI SCI
            'CCOM4996', 'CCOM4087']
# Identical to prog2011 except that CCOM4016 is not listed.
prog2016 = ['MATE3018', 'MATE3023', 'MATE3024', # pre-calculus; students starting in calculus should have tested out of these
            'MATE3151', 'CCOM3030', 'CCOM3981', # espa soc eng
            'CCOM3020', 'CCOM3033', 'CCOM3982', # soc espa eng
            'MATE3152', 'CCOM3034', 'CCOM4086', # sci huma
            'MATE4081', 'MATE4032', 'CCOM4027', 'CCOM4029', # sci huma
            'MATE4080', 'CCOM4017', 'CCOM4030', # libre lite
            'CCOM5050', 'CCOM4065', 'CCOM4205', # art lite
            'MATE5001', 'CCOM5035', 'CCOM4995', # SCI SCI
            'CCOM4996', 'CCOM4087']
def clean(students, core):
    """Keep only each student's attempts at core courses.

    ``students`` maps a student id to a list of (course, session, grade)
    tuples. Attempts whose course is not in ``core`` are dropped, and
    students left with no attempts are omitted from the returned dict.
    """
    filtered = {}
    for student_id, attempts in students.items():
        kept = [(course, sem, grade)
                for course, sem, grade in attempts
                if course in core]
        if kept:
            filtered[student_id] = kept
    return filtered
newstu = clean(sequences, prog2011)
from collections import defaultdict
def getseq(students):
    """Order courses by the summed positions at which students took them.

    Each attempt adds its zero-based index within the student's own list to
    the running total of its course. Courses are returned sorted by
    ascending total, with ties broken by course code.
    """
    position_totals = defaultdict(int)
    for attempts in students.values():
        for position, (course, _elapsed, _grade) in enumerate(attempts):
            position_totals[course] += position
    ranked = sorted(position_totals.items(), key=lambda pair: (pair[1], pair[0]))
    return [course for course, _total in ranked]
cs_seq = getseq(newstu)
cs_seq
['CCOM4065', 'MATE3018', 'CCOM4205', 'CCOM3030', 'CCOM4016', 'CCOM4996', 'MATE3023', 'CCOM3981', 'MATE3024', 'MATE4081', 'CCOM3982', 'CCOM4029', 'MATE5001', 'CCOM4030', 'CCOM3020', 'CCOM3033', 'CCOM3034', 'MATE3151', 'CCOM4086', 'CCOM4027', 'CCOM4017', 'MATE4032', 'CCOM4995', 'CCOM5035', 'MATE3152', 'MATE4080', 'CCOM4087', 'CCOM5050']
import numpy as np
# Position matrix: counts[r, c] is the number of students whose c-th distinct
# core course (in the order their attempts are listed) was cs_seq[r].
rows = len(cs_seq)
counts = np.zeros(rows * rows).reshape(rows,rows)
for stu in newstu:
    courses = [course for course, sem, grade in newstu[stu] if course in cs_seq]
    courses = list(dict.fromkeys(courses)) # unique, preserving order, first occurrence
    for i in range(len(courses)):
        row = cs_seq.index(courses[i])
        counts[row,i] += 1
# Label the rows with the course codes so the plot shows them.
counts = pd.DataFrame(counts, index=cs_seq)
%matplotlib inline
import seaborn as sns; sns.set()
ax = sns.heatmap(counts)
 
def heatmap(students, cs_seq, cols=17):
    """Count course attempts per course and semester offset.

    Parameters
    ----------
    students : dict
        Maps a student id to a list of (course, session, grade) tuples.
    cs_seq : list
        Course codes; each one becomes a row of the result, in this order.
    cols : int, optional
        Number of semester columns. Offsets of ``cols - 1`` or more are
        pooled into the last column.

    Returns
    -------
    pandas.DataFrame
        Frame with ``len(cs_seq)`` rows (indexed by ``cs_seq``) and ``cols``
        columns; each cell counts the attempts at that course at that
        semester offset, as computed by ``semesters``.
    """
    # Row lookup built once; setdefault keeps the first position of a
    # repeated code, which is what list.index would return.
    row_of = {}
    for position, code in enumerate(cs_seq):
        row_of.setdefault(code, position)
    counts = pd.DataFrame(np.zeros((len(cs_seq), cols)), index=cs_seq)
    for attempts in students.values():
        core = [(course, sem, grade) for course, sem, grade in attempts
                if course in row_of]
        if not core:
            # semesters() reads the first element of its argument, so a
            # student with no attempt in cs_seq must be skipped here.
            continue
        # NOTE(review): a negative offset is passed to .iat unchanged; this
        # assumes each student's attempts are listed in chronological
        # order — confirm.
        for course, sem, _grade in semesters(core):
            counts.iat[row_of[course], min(sem, cols - 1)] += 1
    return counts
def failmap(students, cs_seq, cols=17, fail_grades=("F", "F*", "NP", "NP*")):
    """Count failed course attempts per course and semester offset.

    Parameters
    ----------
    students : dict
        Maps a student id to a list of (course, session, grade) tuples.
    cs_seq : list
        Course codes; each one becomes a row of the result, in this order.
    cols : int, optional
        Number of semester columns. Offsets of ``cols - 1`` or more are
        pooled into the last column.
    fail_grades : collection, optional
        Grades that count as a failure.

    Returns
    -------
    pandas.DataFrame
        Frame with ``len(cs_seq)`` rows (indexed by ``cs_seq``) and ``cols``
        columns; each cell counts the attempts at that course, at that
        semester offset (as computed by ``semesters``), whose grade is in
        ``fail_grades``.
    """
    # Row lookup built once; setdefault keeps the first position of a
    # repeated code, which is what list.index would return.
    row_of = {}
    for position, code in enumerate(cs_seq):
        row_of.setdefault(code, position)
    counts = pd.DataFrame(np.zeros((len(cs_seq), cols)), index=cs_seq)
    for attempts in students.values():
        core = [(course, sem, grade) for course, sem, grade in attempts
                if course in row_of]
        if not core:
            # semesters() reads the first element of its argument, so a
            # student with no attempt in cs_seq must be skipped here.
            continue
        # NOTE(review): a negative offset is passed to .iat unchanged; this
        # assumes each student's attempts are listed in chronological
        # order — confirm.
        for course, sem, grade in semesters(core):
            if grade in fail_grades:
                counts.iat[row_of[course], min(sem, cols - 1)] += 1
    return counts
 
def codetosem(code):
    """Convert a three-character session code into a fractional calendar year.

    The first character is read as a hexadecimal decade offset from 1900,
    the second as the year within that decade and the third as the semester
    number; every semester after the first adds half a year.
    """
    decade, year, semester = code[0], code[1], code[2]
    return 1900 + int(decade, base=16) * 10 + int(year) + 0.5 * (int(semester) - 1)

def semesters(courses):
    """Re-express session codes as semester offsets from the first attempt.

    Parameters
    ----------
    courses : list
        (course, session, grade) tuples. The session of the first tuple is
        the reference point.

    Returns
    -------
    list
        (course, offset, grade) tuples in the same order, where offset is
        the whole number of semesters between that attempt's session and
        the reference session. An empty input gives an empty list.
    """
    if not courses:
        # Nothing to anchor the timeline on.
        return []
    # Both ends of the subtraction use the same converter.
    first_sem = codetosem(courses[0][1])
    # Sessions are half a year apart, so doubling the year difference
    # yields a count of semesters.
    return [(course, int((codetosem(session) - first_sem) * 2), grade)
            for course, session, grade in courses]
ax = sns.heatmap(heatmap(clean(sequences, prog2016), prog2016),cmap = None)
ax = sns.heatmap(heatmap(clean(sequences, prog2011), prog2011),cmap = None)
CCOM4016 was a pre-requisite for CCOM4086 until 2011. What is now MATE4081 was MATE4032 until two years ago.
Find students that took precalc in UPR
def took(courses, course):
    """Tell whether ``course`` is the course code of any attempt in ``courses``.

    ``courses`` is a list of rows whose first element is the course code.
    """
    attempted_codes = [row[0] for row in courses]
    return course in attempted_codes
precalc_stu = {k: v for k, v in sequences.items() if (took(v, 'MATE3018') or took(v, 'MATE3024'))}
len(dict(precalc_stu))
318
precalc_failcounts = failmap(precalc_stu, prog2016)
ax = sns.heatmap(heatmap(clean(precalc_stu, prog2016), prog2016),cmap = None, annot=precalc_failcounts)
ax.set(title="Students taking precalculus")
[<matplotlib.text.Text at 0x1181c6860>]
The above plot is shaded by the number of students taking the course indicated in the row at the semester indicated in the column. The number in each cell is the number of students failing the course.
calc_stu = {k: v for k, v in sequences.items() if (not took(v, 'MATE3018') and not took(v, "MATE3024"))}
len(calc_stu)
158
318+158 # sanity check, do I still have all the students?
476
calc_failcount = failmap(clean(calc_stu, prog2016[3:]), prog2016[3:])
ax = sns.heatmap(heatmap(clean(calc_stu, prog2016[3:]), prog2016[3:]), cmap = None, annot=calc_failcount)
ax.set(title="Students starting with calculus")
[<matplotlib.text.Text at 0x1181c2128>]
The above plot is shaded by the number of students taking the course indicated in the row at the semester indicated in the column. The number in each cell is the number of students failing the course.
ax = sns.heatmap(precalc_failcounts,cmap = "inferno", annot=precalc_failcounts)
ax.set(title="Course failures for students starting with precalculus")
[<matplotlib.text.Text at 0x1190f1198>]
ax = sns.heatmap(calc_failcount, cmap = "inferno", annot=calc_failcount)
ax.set(title="Course failures for students starting with calculus")
[<matplotlib.text.Text at 0x119808d30>]
Cohorts
Patti suggested looking at cohorts based on when they took CCOM3030.
intro = students.loc[students["Curso"] == "CCOM3030"]
intro.iloc[0:5,1:3]
| Curso | Sesion | |
|---|---|---|
| 92 | CCOM3030 | A71 | 
| 102 | CCOM3030 | A92 | 
| 142 | CCOM3030 | B32 | 
| 171 | CCOM3030 | B22 | 
| 175 | CCOM3030 | B31 | 
# Group student ids by the session of each of their CCOM3030 rows.
cohorts = {}
for index, Numest, Curso, Sesion, Nota in intro.itertuples():
    cohorts.setdefault(Sesion, []).append(Numest)
# Session codes in ascending string order.
cohort_keys = sorted(cohorts)
Can plot heatmap for each cohort.
# One attempt-count frame and one failure-count frame per cohort, stored in
# the same order as cohort_keys.
heatmaps = []
failmaps = []
for semester in cohort_keys:
    ids = cohorts[semester]
    # Full recorded course list of every member of this cohort.
    cohort = {k : sequences[k] for k in ids}
    heatmaps.append(heatmap(cohort, prog2016))
    failmaps.append(failmap(cohort, prog2016))
import matplotlib.pyplot as plt
# Plot at most the first 15 cohorts (original note: just the first 7 years).
# Slicing plus zip stops early when fewer than 15 cohorts exist, where
# indexing with range(15) would raise IndexError.
for title, shade, fails in zip(cohort_keys[:15], heatmaps, failmaps):
    ax = sns.heatmap(shade, annot=fails)
    ax.set(title=title)
    plt.show()
Cohorts (pre 2007, post 2007)
Carlos suggested examining progression before CCOM3030 (implemented in 2007 academic year) and after.
# Assign each student to the session of the first row in which they appear.
# NOTE(review): that is the student's first semester only if the rows are in
# chronological order — confirm against the data export.
caocohorts = defaultdict(list)
known = set()  # ids already assigned; a set keeps the membership test O(1)
for index, Numest, Curso, Sesion, Nota in students.itertuples():
    if Numest not in known:
        caocohorts[Sesion].append(Numest)
        known.add(Numest)
# Session codes in ascending string order.
cao_keys = sorted(caocohorts)
# List the first-seen sessions that fall before "A71" (2007 under codetonum's
# decoding). Codes are compared as plain strings; digits sort before
# uppercase letters, so codes starting with "8" or "9" come before "A71".
list(filter(lambda semester: semester < "A71", cao_keys))
['821', '921', '961', '971', '981', '991', 'A01', 'A11', 'A12', 'A21', 'A31', 'A41', 'A51', 'A52', 'A61', 'A62']
# Collect every student whose first recorded session sorts before "A71".
pre_stu = []
for sem in list(filter(lambda semester: semester < "A71", cao_keys)):
    pre_stu += caocohorts[sem]
cohort = {k:sequences[k] for k in pre_stu}
# Shade by attempt counts and annotate with failure counts, both restricted
# to the prog2007 course list.
ax = sns.heatmap(heatmap(clean(cohort,prog2007), prog2007), annot=failmap(clean(cohort, prog2007), prog2007))
ax.set(title="Pre 2007")
plt.show()
 
# Collect every student whose first recorded session is "A71" or sorts after it.
post_stu = []
for sem in list(filter(lambda semester: semester >= "A71", cao_keys)):
    post_stu += caocohorts[sem]
 
post_cohort = {k:sequences[k] for k in post_stu}
# Shade by attempt counts and annotate with failure counts, both restricted
# to the prog2011 course list.
ax = sns.heatmap(heatmap(clean(post_cohort,prog2011), prog2011), annot=failmap(clean(post_cohort, prog2011), prog2011))
ax.set(title="Post 2007")
plt.show()
Discrete math
Ive asks how a student's grade in discrete math affects their progress.
def got(courses, course, grades):
    """Tell whether the first attempt at ``course`` earned one of ``grades``.

    ``courses`` is a list of (code, section, grade) rows. Only the first row
    whose code equals ``course`` is considered; later attempts are ignored.
    The grade is converted with str() before the ``in grades`` test because
    the data contains a 'nan' value. Callers pass ``grades`` as a string of
    grade letters such as "AB". Returns False when the course never appears.
    """
    for code, _section, grade in courses:
        if code == course:
            # The first matching row decides the answer.
            return str(grade) in grades
    return False
discrete_stu = {k: v for k, v in sequences.items() if took(v, 'CCOM3020')}
len(discrete_stu)
200
aorb_discrete_stu = {k: v for k, v in discrete_stu.items() if got(v, 'CCOM3020', "AB")}
len(aorb_discrete_stu)
61
ctof_discrete_stu = {k: v for k, v in discrete_stu.items() if not got(v, 'CCOM3020', "AB")}
len(ctof_discrete_stu)
139
ax = sns.heatmap(heatmap(clean(aorb_discrete_stu, prog2016), prog2016),cmap = None)
ax = sns.heatmap(heatmap(clean(ctof_discrete_stu, prog2016), prog2016),cmap = None)
Students who got an A or B in discrete math failing any course
aorb_failcounts = failmap(clean(aorb_discrete_stu, prog2016), prog2016)
ctof_failcounts = failmap(clean(ctof_discrete_stu, prog2016), prog2016)
ax = sns.heatmap(aorb_failcounts,cmap = "inferno", annot=aorb_failcounts)
ax.set(title="Course failures for students with A or B in discrete math (first try)")
[<matplotlib.text.Text at 0x11a42d208>]
ax = sns.heatmap(ctof_failcounts,cmap = "inferno", annot=ctof_failcounts)
ax.set(title="Course failures for students with C through F in discrete math (first try)")
[<matplotlib.text.Text at 0x11a27f9b0>]
justc_discrete_stu = {k: v for k, v in discrete_stu.items() if got(v, 'CCOM3020', "C")}
len(justc_discrete_stu)
67
justc_failcounts = failmap(clean(justc_discrete_stu, prog2016), prog2016)
ax = sns.heatmap(heatmap(clean(justc_discrete_stu, prog2016), prog2016),cmap = None)
ax = sns.heatmap(justc_failcounts,cmap = "inferno", annot=justc_failcounts)
ax.set(title="Course failures for students with C in discrete math (first try)")
[<matplotlib.text.Text at 0x11ac2abe0>]
What about calculus? That's another requirement for students wanting to reclassify.
calc_stu = {k: v for k, v in sequences.items() if took(v, 'MATE3151')}
len(calc_stu)
283
# NOTE(review): this filters discrete_stu (students who took CCOM3020), not
# the calc_stu dict built just above, so students who took MATE3151 but never
# took CCOM3020 are left out — confirm this is intended.
aorb_calc_stu = {k: v for k, v in discrete_stu.items() if got(v, 'MATE3151', "AB")}
len(aorb_calc_stu)
68
# NOTE(review): this filters discrete_stu (students who took CCOM3020), not
# calc_stu, so students who took MATE3151 but never took CCOM3020 are left
# out — confirm this is intended.
justc_calc_stu = {k: v for k, v in discrete_stu.items() if got(v, 'MATE3151', "C")}
len(justc_calc_stu)
42
ax = sns.heatmap(heatmap(clean(aorb_calc_stu, prog2016), prog2016),cmap = None)
ax = sns.heatmap(heatmap(clean(justc_calc_stu, prog2016), prog2016),cmap = None)
justc_calc_failcounts = failmap(clean(justc_calc_stu, prog2016), prog2016)
ax = sns.heatmap(justc_calc_failcounts,cmap = "inferno", annot=justc_calc_failcounts)
ax.set(title="Course failures for students with C in calculus 1 (first try)")
[<matplotlib.text.Text at 0x11b5765f8>]
# NOTE(review): this filters discrete_stu (students who took CCOM3020), not
# calc_stu — confirm this is intended.
# NOTE(review): got() tests str(grade) in "DFW", so a first-try grade of
# "F*" (which failmap counts as a failure) does not match here — confirm.
dtof_calc_stu = {k: v for k, v in discrete_stu.items() if got(v, 'MATE3151', "DFW")}
dtof_calc_failcounts = failmap(clean(dtof_calc_stu, prog2016), prog2016)
ax = sns.heatmap(dtof_calc_failcounts,cmap = "inferno", annot=dtof_calc_failcounts)
ax.set(title="Course failures for students with D,F or W in calculus 1 (first try)")
[<matplotlib.text.Text at 0x11b329b70>]
Wow, that's an interesting last column. The last column represents students taking courses more than 8 years after their first semester. Students with a D or F in calculus are still failing math (and compilers) more than 8 years after starting!
 
Discussion
I was surprised by the failure rate in introductory CS courses by students with pre-calculus.
I was surprised data structures isn't a bottleneck.
The advanced algebra courses are a significant source of delay in graduation.