Source code for service.preclass.processors.qa_utils

def get_question_type(q):
    """
    Classify a question as multiple or single choice and strip the indicator.

    A question is treated as multiple choice when it contains '多选' anywhere;
    otherwise it is single choice. Bracketed indicators such as '(多选)',
    '（多选）', '[多选]' or '【多选】' (and the '单选' equivalents) are removed
    from the start and the end of the text. Indicators in the middle of the
    question are left in place.

    Args:
        q (str): The question text, possibly carrying a choice-type indicator

    Returns:
        tuple: (stripped_question, question_type)
            - stripped_question (str): Question text with leading/trailing
              indicators removed
            - question_type (str): Either 'multiple choice' or 'single choice'
    """
    import re

    if '多选' in q:
        question_type = 'multiple choice'
        label = '多选'
    else:
        question_type = 'single choice'
        label = '单选'

    # Match the whole bracketed marker. str.lstrip/rstrip would treat their
    # argument as a *set of characters* and eat real question text such as a
    # leading '单' or a trailing '选'.
    marker = r'[(（\[【]' + label + r'[)）\]】]'
    edge_markers = r'^(?:{0})+|(?:{0})+\Z'.format(marker)
    stripped_question = re.sub(edge_markers, '', q)
    return stripped_question, question_type
def split_ans(ans_text):
    """
    Map the answer letters found in a string to zero-based option indices.

    Every character of the input is inspected in order; the letters 'A'-'E'
    are translated to 0-4 and every other character (separators, spaces,
    lowercase letters, ...) is ignored.

    Args:
        ans_text (str): String containing answer choices (e.g., 'A', 'BC', 'A,B,C')

    Returns:
        list: Integer indices in order of appearance
        (e.g., [0] for 'A', [1, 2] for 'BC')
    """
    letter_to_index = dict(zip('ABCDE', range(5)))
    return [letter_to_index[ch] for ch in ans_text if ch in letter_to_index]
def strip_select(select_text):
    """
    Remove a leading answer-choice label (A., B., C., ...) from option text.

    Exactly one label is removed: a letter 'A'-'E' followed by a dot, by
    whitespace, or directly by a non-letter character (e.g. 'A选项').
    A capital letter that merely starts a word ('Apple', 'DNA') is not a
    label and is kept. Leading whitespace is dropped in every case.

    Args:
        select_text (str): Option text, possibly starting with a label

    Returns:
        str: Option text with the label prefix removed
    """
    import re

    # str.lstrip takes a character set, so chaining lstrip('A. ') etc. would
    # also eat the first letters of the option itself ('A. Apple' -> 'pple').
    label_prefix = r'^\s*(?:[A-E](?:\.\s*|\s+|(?![A-Za-z])))?'
    return re.sub(label_prefix, '', select_text, count=1)
import copy
def _question_text(raw):
    """Pick the question text out of a line that may carry a 'prefix:' part."""
    head, colon, tail = raw.partition(':')
    if not colon:
        return head
    if len(head) < 2:
        # Very short prefix (e.g. a number): the question follows the colon.
        return tail
    if '单选' in tail or '多选' in tail:
        # The choice indicator sits after the colon; keep both parts.
        return head + tail
    return head


def _question_header(ques_list):
    """Return (question, question_type, index of the first option line)."""
    heading = ques_list[0]
    # A short first line followed by a line without an option letter is
    # treated as a heading; the question itself is then on the second line.
    has_heading = (
        len(ques_list) > 1
        and 'A' not in ques_list[1]
        and len(heading) < 10
    )
    if not has_heading:
        q, question_type = get_question_type(_question_text(heading))
        return q, question_type, 1

    q = _question_text(ques_list[1])
    if '单选' in heading or '多选' in heading:
        # The indicator lives in the heading: take the type from there and
        # leave the question text untouched.
        _, question_type = get_question_type(heading)
    else:
        # '(单选)' or '(多选)' is part of the question text itself.
        q, question_type = get_question_type(q)
    return q, question_type, 2


def _qa_record(q, question_type, selects, ans, ref):
    """Build one parsed QA dictionary (the option list is copied)."""
    return {'question': q, 'question_type': question_type,
            'selects': list(selects), 'answer': ans, 'reference': ref}


def parse_qa(questions):
    """
    Parses a formatted string of questions into structured question-answer objects.

    The input is split into blocks on blank lines. A block containing '问题'
    and the letters A-D starts a new question (with its options and,
    optionally, its '答案:' and '引用文本:' lines). Blocks holding only an
    answer and/or a reference are attached to the most recent question.
    A record is emitted once the reference for a question is known.

    Args:
        questions (str): Multi-line string containing questions, choices,
            answers, and references

    Returns:
        list: List of dictionaries containing parsed QA data with keys:
            - question (str): The question text
            - question_type (str): 'multiple choice' or 'single choice'
            - selects (list): List of answer choices
            - answer (list): List of correct answer indices
            - reference (str): Reference text for the question
    """
    theme_qas = []
    q = None  # most recently parsed question; None until one has been seen
    question_type = None
    selects = []
    ans = []

    for ques in questions.split("\n\n"):
        if '问题' in ques and all(letter in ques for letter in 'ABCD'):
            # Drop everything up to and including the '问题' label. Whole
            # labels are removed (not a character set) so question text
            # starting with '问' or '题' survives.
            body = ques.split('问题', 1)[1]
            while body.startswith('问题'):
                body = body[len('问题'):]
            body = body.lstrip(':')
            ques_list = body.split('\n')

            q, question_type, first_option = _question_header(ques_list)
            ans = []  # never carry an answer over from the previous question

            if "答案:" not in body:
                # Options only; answer and reference arrive in later blocks.
                selects = [strip_select(line)
                           for line in ques_list[first_option:]]
                continue

            selects = []
            ans_index = 0
            for offset, line in enumerate(ques_list[first_option:]):
                if "答案:" in line:
                    ans = split_ans(line.partition("答案:")[2])
                    ans_index = offset + first_option
                    break
                selects.append(strip_select(line))

            # Emit only when the reference is present after the answer line;
            # otherwise the question stays pending for a later reference block.
            for line in ques_list[ans_index + 1:]:
                if "引用文本:" in line:
                    ref = line.partition("引用文本:")[2]
                    theme_qas.append(
                        _qa_record(q, question_type, selects, ans, ref))
                    break

        elif "答案:" in ques:
            if q is None:
                continue
            answer_text = ques.partition("答案:")[2]
            if "引用文本" in ques:
                # Answer and reference share a block; the answer is whatever
                # sits between the two labels, the reference is one line.
                ans = split_ans(answer_text.partition("引用文本")[0])
                ref = ques.partition("引用文本")[2].lstrip(':')
                ref = ref.split('\n', 1)[0]
                theme_qas.append(
                    _qa_record(q, question_type, selects, ans, ref))
            else:
                ans = split_ans(answer_text)

        elif "引用文本:" in ques:
            if q is None:
                continue
            ref = ques.partition("引用文本:")[2]
            theme_qas.append(_qa_record(q, question_type, selects, ans, ref))

    return theme_qas