Source code for pykt.preprocess.nips_task34_preprocess

import os
import pandas as pd
from .utils import sta_infos, write_txt, change2timestamp, format_list2str



def load_nips_data(primary_data_path, meta_data_dir, task_name):
    """Load and merge the NeurIPS 2020 Education Challenge data.

    The data can be downloaded from https://competitions.codalab.org/competitions/25449
    and the accompanying paper from https://arxiv.org/abs/2007.12061.

    Args:
        primary_data_path (str): path to the primary data csv
        meta_data_dir (str): metadata directory
        task_name (str): "task_1_2" or "task_3_4"

    Returns:
        dataframe: the merged df
    """
    print("Start load data")
    answer_metadata_path = os.path.join(meta_data_dir, f"answer_metadata_{task_name}.csv")
    question_metadata_path = os.path.join(meta_data_dir, f"question_metadata_{task_name}.csv")
    student_metadata_path = os.path.join(meta_data_dir, f"student_metadata_{task_name}.csv")
    subject_metadata_path = os.path.join(meta_data_dir, "subject_metadata.csv")

    df_primary = pd.read_csv(primary_data_path)
    print(f"len df_primary is {len(df_primary)}")

    # add timestamp
    df_answer = pd.read_csv(answer_metadata_path)
    df_answer['answer_timestamp'] = df_answer['DateAnswered'].apply(change2timestamp)
    df_question = pd.read_csv(question_metadata_path)
    # df_student = pd.read_csv(student_metadata_path)
    df_subject = pd.read_csv(subject_metadata_path)

    # only keep level-3 subjects as the knowledge concepts
    keep_subject_ids = set(df_subject[df_subject['Level'] == 3]['SubjectId'])
    df_question['SubjectId_level3'] = df_question['SubjectId'].apply(lambda x: set(eval(x)) & keep_subject_ids)

    # merge data
    df_merge = df_primary.merge(df_answer[['AnswerId', 'answer_timestamp']], how='left')  # merge answer time
    df_merge = df_merge.merge(df_question[["QuestionId", "SubjectId_level3"]], how='left')  # merge question subjects
    df_merge['SubjectId_level3_str'] = df_merge['SubjectId_level3'].apply(lambda x: "_".join([str(i) for i in x]))
    print(f"len df_merge is {len(df_merge)}")

    print("Finish load data")
    print(f"Num of student {df_merge['UserId'].unique().size}")
    print(f"Num of question {df_merge['QuestionId'].unique().size}")
    kcs = []
    for item in df_merge['SubjectId_level3'].values:
        kcs.extend(item)
    print(f"Num of knowledge {len(set(kcs))}")
    return df_merge
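
A minimal usage sketch for load_nips_data. The paths below are hypothetical (they depend on where the challenge data was unzipped); only the metadata file names are fixed by the loader itself:

# hypothetical paths -- adjust to your local copy of the challenge data
df_merge = load_nips_data(
    primary_data_path="../data/nips_task34/train_task_3_4.csv",
    meta_data_dir="../data/nips_task34/metadata",
    task_name="task_3_4",
)
# each row now carries answer_timestamp, SubjectId_level3 (a set of
# level-3 subject ids), and SubjectId_level3_str (those ids joined by "_")
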
def get_user_inters(df):
    """Convert the merged df to per-user interaction sequences.

    Args:
        df (dataframe): the merged df

    Returns:
        List: user_inters
    """
    user_inters = []
    for user, group in df.groupby("UserId", sort=False):
        group = group.sort_values(["answer_timestamp", "tmp_index"], ascending=True)
        seq_skills = group['SubjectId_level3_str'].tolist()
        seq_ans = group['IsCorrect'].tolist()
        seq_response_cost = ["NA"]  # response time is not available in this dataset
        seq_start_time = group['answer_timestamp'].tolist()
        seq_problems = group['QuestionId'].tolist()
        seq_len = len(group)
        user_inters.append(
            [[str(user), str(seq_len)],
             format_list2str(seq_problems),
             format_list2str(seq_skills),
             format_list2str(seq_ans),
             format_list2str(seq_start_time),
             format_list2str(seq_response_cost)])
    return user_inters
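
Each element of user_inters is a list of six string lists, one element per student, which write_txt then serializes. A toy illustration of one element (all values invented for this sketch):

# illustrative only -- one student with two interactions
example = [
    ["u1", "2"],                          # user id, sequence length
    ["1017", "204"],                      # question ids
    ["33_71", "49"],                      # level-3 subject ids joined by "_"
    ["1", "0"],                           # correctness
    ["1568328787000", "1568329057000"],   # answer timestamps
    ["NA"],                               # response cost (unavailable here)
]
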
KEYS = ["UserId", "SubjectId_level3_str", "QuestionId"]
def read_data_from_csv(primary_data_path, meta_data_dir, task_name, write_file):
    stares = []

    df = load_nips_data(primary_data_path, meta_data_dir, task_name)
    ins, us, qs, cs, avgins, avgcq, na = sta_infos(df, KEYS, stares)
    print(f"original interaction num: {ins}, user num: {us}, question num: {qs}, concept num: {cs}, avg(ins) per s: {avgins}, avg(c) per q: {avgcq}, na: {na}")

    # keep the original row order as a tie-breaker when sorting by timestamp
    df['tmp_index'] = range(len(df))
    df = df.dropna(subset=["UserId", "answer_timestamp", "SubjectId_level3_str", "IsCorrect", "QuestionId"])
    ins, us, qs, cs, avgins, avgcq, na = sta_infos(df, KEYS, stares)
    print(f"after drop interaction num: {ins}, user num: {us}, question num: {qs}, concept num: {cs}, avg(ins) per s: {avgins}, avg(c) per q: {avgcq}, na: {na}")

    user_inters = get_user_inters(df)
    write_txt(write_file, user_inters)
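
A minimal sketch of driving the full pipeline from this module (paths hypothetical; in pykt this function is normally invoked by the data preprocessing script rather than run standalone):

if __name__ == "__main__":
    # hypothetical paths -- adjust to your local data layout
    read_data_from_csv(
        primary_data_path="../data/nips_task34/train_task_3_4.csv",
        meta_data_dir="../data/nips_task34/metadata",
        task_name="task_3_4",
        write_file="../data/nips_task34/data.txt",
    )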