""" Script to extract depdendency and XML markup information from data in the conll10 and CWB XML formats. """ import os, re import ntpath class ParsedToken: def __init__(self, tok_id, text, lemma, pos, morph, head, func): self.id = tok_id self.text = text.strip() self.text_lower = text.lower() self.pos = pos self.lemma = lemma if lemma != "_" else text self.morph = morph self.head = head self.func = func self.heading = "_" self.caption = "_" self.list = "_" self.date = "_" self.s_type = "_" def __repr__(self): return str(self.text) + " (" + str(self.pos) + "/" + str(self.lemma) + ") " + "<-" + str(self.func) + "- " + str(self.head_text) def get_tok_info(docname,corpus_root): if corpus_root[-1]!=os.sep: corpus_root += os.sep xml_file = corpus_root + "xml" + os.sep + docname + ".xml" conll_file = corpus_root + "dep" + os.sep + docname + ".conll10" tokens = [] for line in open(conll_file).read().replace("\r","").split("\n"): if "\t" in line: cols = line.split("\t") tokens.append(ParsedToken(cols[0],cols[1],cols[2],cols[3],cols[5],cols[6],cols[7])) counter = 0 heading = "_" caption = "_" date = "_" list = "_" s_type = "_" para = "_" item = "_" for line in open(xml_file).read().replace("\r", "").split("\n"): if "' in line: para = "open_para" elif '' in line: item = "open_item" if "\t" in line: tokens[counter].heading = heading tokens[counter].caption = caption tokens[counter].list = list tokens[counter].s_type = s_type tokens[counter].date = date tokens[counter].para = para tokens[counter].item = item para = "_" item = "_" counter += 1 return tokens