Source code for pykt.preprocess.junyi2015_preprocess

import pandas as pd
from .utils import sta_infos, write_txt, replace_text

def load_q2c(qname):
    """Build an exercise-name -> topic lookup from a CSV file.

    The file is read with pandas and rows lacking either a ``name`` or a
    ``topic`` value are discarded.  When the same name occurs more than once,
    the first topic seen is kept and the later occurrence is only reported on
    stdout.

    Args:
        qname: Path (or any object accepted by ``pandas.read_csv``) of a CSV
            file that contains at least the columns ``name`` and ``topic``.

    Returns:
        dict: Maps each distinct ``name`` to the ``topic`` of its first row.
    """
    df = pd.read_csv(qname, encoding="utf-8", low_memory=False).dropna(
        subset=["name", "topic"]
    )
    dq2c = dict()
    for name, topic in zip(df["name"], df["topic"]):
        if name not in dq2c:
            dq2c[name] = topic
        else:
            # First mapping wins; the duplicate is only logged.
            print(f"already has topic in dict: {name}: {topic}, {dq2c[name]}")
    print(f"dq2c: {len(dq2c)}")
    return dq2c
# Column names handed to ``sta_infos`` by ``read_data_from_csv`` when it
# collects dataset statistics.
# NOTE(review): presumably the user / concept / question columns, in that
# order -- confirm against ``sta_infos`` in ``.utils``.
KEYS = ["user_id", "topic", "exercise"]
def _first_attempt_to_str(raw):
    """Return the first ``&``-separated field of *raw* times 1000, as a string."""
    first = str(raw).split("&")[0]
    try:
        value = int(first)
    except ValueError:
        # pandas may have parsed the column as float (e.g. "12.0"), which
        # plain int() rejects.
        value = int(float(first))
    return str(value * 1000)


def read_data_from_csv(read_file, write_file, dq2c):
    """Convert a raw interaction log CSV into per-user sequences and write them.

    Steps performed:

    1. Read ``read_file`` and attach a ``topic`` column by looking each
       ``exercise`` up in ``dq2c``; rows whose exercise is unknown are dropped.
    2. Pass ``exercise`` and ``topic`` through ``replace_text``.
    3. Drop rows missing ``user_id``, ``exercise``, ``time_done`` or
       ``correct``, and rows whose ``correct`` is not a boolean value.
    4. Normalise ``time_taken_attempts`` (first ``&``-separated field times
       1000, missing values become ``-100`` before scaling) and ``time_done``.
    5. Group by user, order every group by ``time_done`` then original row
       order, and hand the collected sequences to ``write_txt``.

    Statistics are gathered through ``sta_infos`` before and after filtering
    and printed to stdout.

    Args:
        read_file: Path of the raw CSV log. It must provide the columns
            ``user_id``, ``exercise``, ``time_done``, ``time_taken_attempts``,
            ``correct`` and ``count_attempts``.
        write_file: Destination passed through to ``write_txt``.
        dq2c: Mapping from raw exercise name to topic (see ``load_q2c``).

    Returns:
        None. The result is written via ``write_txt``.
    """
    stares = []

    df = pd.read_csv(read_file)
    # The lookup must use the raw exercise name, so it happens before
    # ``replace_text`` rewrites the column.  "NANA" marks unknown exercises.
    df["topic"] = df["exercise"].apply(lambda q: dq2c.get(q, "NANA"))
    df["exercise"] = df["exercise"].apply(replace_text)
    df["topic"] = df["topic"].apply(replace_text)
    # .copy() so the column assignments below act on an independent frame.
    df = df[df["topic"] != "NANA"].copy()

    ins, us, qs, cs, avgins, avgcq, na = sta_infos(df, KEYS, stares)
    print(f"original interaction num: {ins}, user num: {us}, question num: {qs}, concept num: {cs}, avg(ins) per s: {avgins}, avg(c) per q: {avgcq}, na: {na}")

    # Remember the original row order; used as a tie-breaker when sorting.
    df["index"] = range(df.shape[0])
    print(f"original recores shape: {df.shape}")

    usedf = df[["index", "user_id", "exercise", "time_done", "time_taken_attempts", "correct", "count_attempts", "topic"]]
    usedf = usedf.dropna(subset=["user_id", "exercise", "time_done", "correct"])
    # Keep only rows with a proper boolean outcome.
    # NOTE(review): the original comment here read "only hint! False correct"
    # -- exact intent unclear, confirm with the dataset description.
    usedf = usedf[usedf["correct"].isin([False, True])].copy()
    usedf["time_taken_attempts"] = (
        usedf["time_taken_attempts"].fillna(-100).astype(str).apply(_first_attempt_to_str)
    )
    usedf["time_done"] = usedf["time_done"].astype(int)

    ins, us, qs, cs, avgins, avgcq, na = sta_infos(usedf, KEYS, stares)
    print(f"after drop interaction num: {ins}, user num: {us}, question num: {qs}, concept num: {cs}, avg(ins) per s: {avgins}, avg(c) per q: {avgcq}, na: {na}")

    data = []
    uids = usedf.user_id.unique()
    problems = usedf.exercise.unique()
    print(f"usedf: {usedf.shape}, uids: {len(uids)}, problems: {len(problems)}")

    # Group by the scalar column name: grouping by a one-element list makes
    # pandas >= 2.0 yield 1-tuple keys, which would corrupt ``str(uid)``.
    for uid, curdf in usedf.groupby("user_id", sort=False):
        curdf = curdf.sort_values(by=["time_done", "index"])
        # Scale the timestamp down by 1000 (original comment: "ms").
        curdf["time_done"] = curdf["time_done"].apply(lambda x: round(int(x) / 1000)).astype(str)

        questions = curdf["exercise"].tolist()
        concepts = curdf["topic"].tolist()
        rs = curdf["correct"].astype(int).astype(str).tolist()
        ts = curdf["time_done"].tolist()
        uts = curdf["time_taken_attempts"].tolist()

        seq_len = len(rs)
        uc = [str(uid), str(seq_len)]
        data.append([uc, questions, concepts, rs, ts, uts])
        if len(data) % 1000 == 0:
            # Progress indicator, one line per 1000 users.
            print(len(data))

    write_txt(write_file, data)
    print("\n".join(stares))
    return