Source code for pykt.preprocess.poj_preprocess

import pandas as pd
from .utils import sta_infos, write_txt, change2timestamp

# Column names passed to sta_infos() each time dataset statistics are collected.
KEYS = ["User", "Problem"]
def read_data_from_csv(read_file, write_file):
    """Convert the raw POJ submission CSV into per-user interaction records.

    The CSV is loaded with pandas; rows missing ``User``, ``Problem``,
    ``Result`` or ``Submit Time`` are dropped, and only rows whose ``Result``
    is one of the verdicts listed in ``final_verdicts`` are kept. ``Result``
    is then binarised to ``"1"`` (``Accepted``) or ``"0"`` (anything else).
    Each user's rows are ordered by converted submit time, with the original
    row position breaking ties, and the collected per-user records are passed
    to ``write_txt``. Statistics returned by ``sta_infos`` are printed before
    and after filtering.

    Args:
        read_file: Anything ``pandas.read_csv`` accepts; the data must contain
            the columns ``User``, ``Problem``, ``Result`` and ``Submit Time``.
        write_file: Destination handed unchanged to ``write_txt``.

    Returns:
        None.
    """
    # In-progress statuses ('Waiting', 'Running & Judging', 'Compiling') are
    # deliberately absent, so such rows are filtered out.
    final_verdicts = [
        'Accepted', 'Wrong Answer', 'Compile Error', 'Time Limit Exceeded',
        'Memory Limit Exceeded', 'Runtime Error', 'Output Limit Exceeded',
        'Presentation Error', 'System Error', 'Validator Error',
    ]
    required_columns = ["User", "Problem", "Result", "Submit Time"]

    # Passed to sta_infos on every call; its contents are printed at the end.
    stares = []

    df = pd.read_csv(read_file)
    ins, us, qs, cs, avgins, avgcq, na = sta_infos(df, KEYS, stares)
    print(f"original interaction num: {ins}, user num: {us}, question num: {qs}, concept num: {cs}, avg(ins) per s: {avgins}, avg(c) per q: {avgcq}, na: {na}")

    # Remember the original row position so it can break ties between
    # submissions that share the same submit time.
    df["index"] = range(df.shape[0])
    df = df.dropna(subset=required_columns)
    # Explicit copy: the column assignments below must target an independent
    # frame, not a filtered view of the original (SettingWithCopyWarning).
    df = df[df["Result"].isin(final_verdicts)].copy()
    df["Result"] = df["Result"].apply(lambda k: "1" if k == "Accepted" else "0")

    ins, us, qs, cs, avgins, avgcq, na = sta_infos(df, KEYS, stares)
    print(f"after drop interaction num: {ins}, user num: {us}, question num: {qs}, concept num: {cs}, avg(ins) per s: {avgins}, avg(c) per q: {avgcq}, na: {na}")

    # Convert submit times once for the whole frame instead of once per user.
    # Done after the statistics call so sta_infos sees the same input as before.
    df["Submit Time"] = df["Submit Time"].apply(lambda t: change2timestamp(t, False))

    data = []
    # Group by the column name, not a one-element list: with a list grouper
    # pandas >= 2.0 yields 1-tuples as keys, which would turn str(uid) into
    # "('name',)" instead of the plain user id.
    for uid, curdf in df.groupby("User", sort=False):
        curdf = curdf.sort_values(by=["Submit Time", "index"])

        # problem -> concept
        concepts = curdf["Problem"].astype(str)
        responses = curdf["Result"]
        timestamps = curdf["Submit Time"].astype(str)
        questions = ["NA"]
        usetimes = ["NA"]
        uids = [str(uid), str(len(responses))]

        data.append([uids, questions, concepts, responses, timestamps, usetimes])
        # Progress indicator every 1000 users.
        if len(data) % 1000 == 0:
            print(len(data))

    write_txt(write_file, data)
    print("\n".join(stares))