# Source code for streamcorpus.ttypes

#
# Autogenerated by Thrift Compiler (0.9.0)
#
# DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
#
#  options string: py:new_style,slots
#

from thrift.Thrift import TType, TMessageType, TException, TApplicationException

from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol, TProtocol
try:
  from thrift.protocol import fastbinary
except:
  fastbinary = None


class OffsetType(object):
  """
  Enumeration of the units in which an Offset is measured.

  Offset and OffsetType are used by Annotation to identify the
  portion of a ContentItem that a human labeled with a tag.

  LINES:       annotation applies to a range of line numbers
  BYTES:       annotation applies to a range of bytes
  CHARS:       annotation applies to a range of chars, typically
               unicode chars
  XPATH_CHARS: annotation applies to a range defined by xpaths (with
               relative char offsets)
  """
  LINES, BYTES, CHARS, XPATH_CHARS = 0, 1, 2, 3

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {
    LINES: "LINES",
    BYTES: "BYTES",
    CHARS: "CHARS",
    XPATH_CHARS: "XPATH_CHARS",
  }

  # symbolic name -> integer value (exact inverse of the table above)
  _NAMES_TO_VALUES = dict(
    (name, value) for value, name in _VALUES_TO_NAMES.items())

class FlagType(object):
  """
  General purpose flags. These flags can be used to mark documents as
  meeting an extensible set of criteria.

  PROFILE is currently the only flag defined.
  """
  PROFILE = 0

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {PROFILE: "PROFILE"}

  # symbolic name -> integer value
  _NAMES_TO_VALUES = {"PROFILE": PROFILE}

class EntityType(object):
  """
  Different tagging tools have different strings for labeling the
  various common entity types.  To avoid ambiguity, we define a
  canonical list here, which we will surely have to expand over time
  as new taggers recognize new types of entities.

  LOC: physical location

  MISC: uncategorized named entities, e.g. Civil War for Stanford CoreNLP

  Note that the values 3 and 4 are not assigned.
  """
  PER = 0
  ORG = 1
  LOC = 2
  TIME = 5
  DATE = 6
  MONEY = 7
  PERCENT = 8
  MISC = 9
  GPE = 10
  FAC = 11
  VEH = 12
  WEA = 13
  phone = 14
  email = 15
  URL = 16
  CUSTOM_TYPE = 17
  LIST = 18
  RELIGION = 19
  NATIONALITY = 20
  TITLE = 21
  EVENT = 22

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {
    0: "PER",
    1: "ORG",
    2: "LOC",
    5: "TIME",
    6: "DATE",
    7: "MONEY",
    8: "PERCENT",
    9: "MISC",
    10: "GPE",
    11: "FAC",
    12: "VEH",
    13: "WEA",
    14: "phone",
    15: "email",
    16: "URL",
    17: "CUSTOM_TYPE",
    18: "LIST",
    19: "RELIGION",
    20: "NATIONALITY",
    21: "TITLE",
    22: "EVENT",
  }

  # symbolic name -> integer value (exact inverse of the table above)
  _NAMES_TO_VALUES = dict(
    (name, value) for value, name in _VALUES_TO_NAMES.items())
class MentionType(object):
  """
  Enumeration of mention kinds: NAME, PRO and NOM.
  """
  NAME = 0
  PRO = 1
  NOM = 2

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {
    0: "NAME",
    1: "PRO",
    2: "NOM",
  }

  # symbolic name -> integer value
  _NAMES_TO_VALUES = {
    "NAME": 0,
    "PRO": 1,
    "NOM": 2,
  }
class Gender(object):
  """
  Enumeration of gender values: FEMALE and MALE.
  """
  FEMALE = 0
  MALE = 1

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {
    0: "FEMALE",
    1: "MALE",
  }

  # symbolic name -> integer value
  _NAMES_TO_VALUES = {
    "FEMALE": 0,
    "MALE": 1,
  }
class AttributeType(object):
  """
  Attributes are based primarily on TAC KBP, see also saved in this directory
  http://surdeanu.info/kbp2013/TAC_2013_KBP_Slot_Descriptions_1.0.pdf

  Only slots that are not resolvable to unique entities are listed
  here as attributes.  Most slots are relations, so see RelationType.

  Note that the value 2 is not assigned.
  """
  PER_AGE = 0
  PER_GENDER = 1
  PER_ALTERNATE_NAMES = 3
  PER_CAUSE_OF_DEATH = 4
  PER_TITLE = 5
  PER_CHARGES = 6
  ORG_ALTERNATE_NAMES = 7
  ORG_NUMBER_OF_EMPLOYEES_MEMBERS = 8

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {
    0: "PER_AGE",
    1: "PER_GENDER",
    3: "PER_ALTERNATE_NAMES",
    4: "PER_CAUSE_OF_DEATH",
    5: "PER_TITLE",
    6: "PER_CHARGES",
    7: "ORG_ALTERNATE_NAMES",
    8: "ORG_NUMBER_OF_EMPLOYEES_MEMBERS",
  }

  # symbolic name -> integer value (exact inverse of the table above)
  _NAMES_TO_VALUES = dict(
    (name, value) for value, name in _VALUES_TO_NAMES.items())
class ZoneType(object):
  """
  Enumeration of document zones: UNZONED, HEADER, TITLE, BODY and
  FOOTER.
  """
  UNZONED = 0
  HEADER = 1
  TITLE = 2
  BODY = 3
  FOOTER = 4

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {
    0: "UNZONED",
    1: "HEADER",
    2: "TITLE",
    3: "BODY",
    4: "FOOTER",
  }

  # symbolic name -> integer value
  _NAMES_TO_VALUES = {
    "UNZONED": 0,
    "HEADER": 1,
    "TITLE": 2,
    "BODY": 3,
    "FOOTER": 4,
  }
class RelationType(object):
  """
  RelationType is used in Relation to map relation "name" to type.

  Relations 0 through 50 borrow from ACE with these string replacements:
  s/-// and s/./_/
  http://projects.ldc.upenn.edu/ace/docs/English-Events-Guidelines_v5.4.3.pdf

  Relations 51- borrows from KBP slot filling
  http://surdeanu.info/kbp2013/TAC_2013_KBP_Slot_Descriptions_1.0.pdf

  Most entity slots are relations, so the PER_ and ORG_ and FAC_
  relations listed below are primary for slot filling.

  Many of the KBP-based slots are redundant or overlapping with the
  ACE-based slots.  The KBP-based slots are generally simpler and
  were developed to support knowledge base population rather than
  single-document extraction (as ACE was).  Therefore, for KB-focused
  tasks, we recommend using the Relations 51-
  """
  # Relations 0-50 (ACE-derived names)
  PHYS_Located = 0
  PHYS_Near = 1
  PARTWHOLE_Geographical = 2
  PARTWHOLE_Subsidiary = 3
  PARTWHOLE_Artifact = 4
  PERSOC_Business = 5
  PERSOC_Family = 6
  PERSOC_LastingPersonal = 7
  ORGAFF_Employment = 8
  ORGAFF_Ownership = 9
  ORGAFF_Founder = 10
  ORGAFF_StudentAlum = 11
  ORGAFF_SportsAffiliation = 12
  ORGAFF_InvestorShareholder = 13
  ORGAFF_Membership = 14
  ART_UserOwnerInventorManufacturer = 15
  GENAFF_CitizenResidentReligionEthnicity = 16
  GENAFF_OrgLocation = 17
  Business_DeclareBankruptcy = 18
  Business_EndOrg = 19
  Business_MergeOrg = 20
  Business_StartOrg = 21
  Conflict_Attack = 22
  Conflict_Demonstrate = 23
  Contact_PhoneWrite = 24
  Contact_Meet = 25
  Justice_Acquit = 26
  Justice_Appeal = 27
  Justice_ArrestJail = 28
  Justice_ChargeIndict = 29
  Justice_Convict = 30
  Justice_Execute = 31
  Justice_Extradite = 32
  Justice_Fine = 33
  Justice_Pardon = 34
  Justice_ReleaseParole = 35
  Justice_Sentence = 36
  Justice_Sue = 37
  Justice_TrialHearing = 38
  Life_BeBorn = 39
  Life_Die = 40
  Life_Divorce = 41
  Life_Injure = 42
  Life_Marry = 43
  Movement_Transport = 44
  Personnel_Elect = 45
  Personnel_EndPosition = 46
  Personnel_Nominate = 47
  Personnel_StartPosition = 48
  Transaction_TransferMoney = 49
  Transaction_TransferOwnership = 50
  # Relations 51- (KBP slot-filling names)
  PER_DATE_OF_BIRTH = 51
  PER_COUNTRY_OF_BIRTH = 52
  PER_STATEORPROVINCE_OF_BIRTH = 53
  PER_CITY_OF_BIRTH = 54
  PER_ORIGIN = 55
  PER_DATE_OF_DEATH = 56
  PER_COUNTRY_OF_DEATH = 57
  PER_STATEORPROVINCE_OF_DEATH = 58
  PER_CITY_OF_DEATH = 59
  PER_COUNTRIES_OF_RESIDENCE = 60
  PER_STATESORPROVINCES_OF_RESIDENCE = 61
  PER_CITIES_OF_RESIDENCE = 62
  PER_SCHOOLS_ATTENDED = 63
  PER_EMPLOYEE_OR_MEMBER_OF = 64
  PER_RELIGION = 65
  PER_SPOUSE = 66
  PER_CHILDREN = 67
  PER_PARENTS = 68
  PER_SIBLINGS = 69
  PER_OTHER_FAMILY = 70
  ORG_TOP_MEMBERS_EMPLOYEES = 71
  ORG_MEMBERS = 72
  ORG_MEMBER_OF = 73
  ORG_SUBSIDIARIES = 74
  ORG_PARENTS = 75
  ORG_FOUNDED_BY = 76
  ORG_DATE_FOUNDED = 77
  ORG_DATE_DISSOLVED = 78
  ORG_COUNTRY_OF_HEADQUARTERS = 79
  ORG_STATEORPROVINCE_OF_HEADQUARTERS = 80
  ORG_CITY_OF_HEADQUARTERS = 81
  ORG_SHAREHOLDERS = 82
  ORG_POLITICAL_OR_RELIGIOUS_AFFILIATION = 83
  ORG_WEBSITE = 84
  FAC_LOCATED = 85
  FAC_VISITED_BY = 86
  FAC_OWNER = 87
  PER_WON_AWARD = 88
  PER_MET_WITH = 89
  PER_ATTENDED = 90
  PER_VISITED = 91
  ORG_ATTENDED = 92
  ORG_VISITED = 93
  PER_WEBSITE = 94
  PER_NATIONALITY = 95

  # integer value -> symbolic name
  _VALUES_TO_NAMES = {
    0: "PHYS_Located",
    1: "PHYS_Near",
    2: "PARTWHOLE_Geographical",
    3: "PARTWHOLE_Subsidiary",
    4: "PARTWHOLE_Artifact",
    5: "PERSOC_Business",
    6: "PERSOC_Family",
    7: "PERSOC_LastingPersonal",
    8: "ORGAFF_Employment",
    9: "ORGAFF_Ownership",
    10: "ORGAFF_Founder",
    11: "ORGAFF_StudentAlum",
    12: "ORGAFF_SportsAffiliation",
    13: "ORGAFF_InvestorShareholder",
    14: "ORGAFF_Membership",
    15: "ART_UserOwnerInventorManufacturer",
    16: "GENAFF_CitizenResidentReligionEthnicity",
    17: "GENAFF_OrgLocation",
    18: "Business_DeclareBankruptcy",
    19: "Business_EndOrg",
    20: "Business_MergeOrg",
    21: "Business_StartOrg",
    22: "Conflict_Attack",
    23: "Conflict_Demonstrate",
    24: "Contact_PhoneWrite",
    25: "Contact_Meet",
    26: "Justice_Acquit",
    27: "Justice_Appeal",
    28: "Justice_ArrestJail",
    29: "Justice_ChargeIndict",
    30: "Justice_Convict",
    31: "Justice_Execute",
    32: "Justice_Extradite",
    33: "Justice_Fine",
    34: "Justice_Pardon",
    35: "Justice_ReleaseParole",
    36: "Justice_Sentence",
    37: "Justice_Sue",
    38: "Justice_TrialHearing",
    39: "Life_BeBorn",
    40: "Life_Die",
    41: "Life_Divorce",
    42: "Life_Injure",
    43: "Life_Marry",
    44: "Movement_Transport",
    45: "Personnel_Elect",
    46: "Personnel_EndPosition",
    47: "Personnel_Nominate",
    48: "Personnel_StartPosition",
    49: "Transaction_TransferMoney",
    50: "Transaction_TransferOwnership",
    51: "PER_DATE_OF_BIRTH",
    52: "PER_COUNTRY_OF_BIRTH",
    53: "PER_STATEORPROVINCE_OF_BIRTH",
    54: "PER_CITY_OF_BIRTH",
    55: "PER_ORIGIN",
    56: "PER_DATE_OF_DEATH",
    57: "PER_COUNTRY_OF_DEATH",
    58: "PER_STATEORPROVINCE_OF_DEATH",
    59: "PER_CITY_OF_DEATH",
    60: "PER_COUNTRIES_OF_RESIDENCE",
    61: "PER_STATESORPROVINCES_OF_RESIDENCE",
    62: "PER_CITIES_OF_RESIDENCE",
    63: "PER_SCHOOLS_ATTENDED",
    64: "PER_EMPLOYEE_OR_MEMBER_OF",
    65: "PER_RELIGION",
    66: "PER_SPOUSE",
    67: "PER_CHILDREN",
    68: "PER_PARENTS",
    69: "PER_SIBLINGS",
    70: "PER_OTHER_FAMILY",
    71: "ORG_TOP_MEMBERS_EMPLOYEES",
    72: "ORG_MEMBERS",
    73: "ORG_MEMBER_OF",
    74: "ORG_SUBSIDIARIES",
    75: "ORG_PARENTS",
    76: "ORG_FOUNDED_BY",
    77: "ORG_DATE_FOUNDED",
    78: "ORG_DATE_DISSOLVED",
    79: "ORG_COUNTRY_OF_HEADQUARTERS",
    80: "ORG_STATEORPROVINCE_OF_HEADQUARTERS",
    81: "ORG_CITY_OF_HEADQUARTERS",
    82: "ORG_SHAREHOLDERS",
    83: "ORG_POLITICAL_OR_RELIGIOUS_AFFILIATION",
    84: "ORG_WEBSITE",
    85: "FAC_LOCATED",
    86: "FAC_VISITED_BY",
    87: "FAC_OWNER",
    88: "PER_WON_AWARD",
    89: "PER_MET_WITH",
    90: "PER_ATTENDED",
    91: "PER_VISITED",
    92: "ORG_ATTENDED",
    93: "ORG_VISITED",
    94: "PER_WEBSITE",
    95: "PER_NATIONALITY",
  }

  # symbolic name -> integer value (exact inverse of the table above)
  _NAMES_TO_VALUES = dict(
    (name, value) for value, name in _VALUES_TO_NAMES.items())
class Versions(object): """ Versions of this protocol are enumerated so that when we expand, everybody can see which version a particular data file used. v0_1_0 refers to the kba.thrift definition, which was before Versions was included in the spec. """ v0_2_0 = 0 v0_3_0 = 1 _VALUES_TO_NAMES = { 0: "v0_2_0", 1: "v0_3_0", } _NAMES_TO_VALUES = { "v0_2_0": 0, "v0_3_0": 1, } class StreamTime(object): """ StreamTime is a timestamp measured in seconds since the 1970 epoch. epoch_ticks is always in the UTC timezone. This is used in several structs below to record various moments in history. Implementations of these interfaces in various languages may provide convenience methods for insuring that these two fields are consistent with each other. Attributes: - epoch_ticks - zulu_timestamp """ __slots__ = [ 'epoch_ticks', 'zulu_timestamp', ] thrift_spec = ( None, # 0 (1, TType.DOUBLE, 'epoch_ticks', None, None, ), # 1 (2, TType.STRING, 'zulu_timestamp', None, None, ), # 2 ) def __init__(self, epoch_ticks=None, zulu_timestamp=None,): self.epoch_ticks = epoch_ticks self.zulu_timestamp = zulu_timestamp def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.DOUBLE: self.epoch_ticks = iprot.readDouble(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRING: self.zulu_timestamp = iprot.readString(); else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, 
(self.__class__, self.thrift_spec))) return oprot.writeStructBegin('StreamTime') if self.epoch_ticks is not None: oprot.writeFieldBegin('epoch_ticks', TType.DOUBLE, 1) oprot.writeDouble(self.epoch_ticks) oprot.writeFieldEnd() if self.zulu_timestamp is not None: oprot.writeFieldBegin('zulu_timestamp', TType.STRING, 2) oprot.writeString(self.zulu_timestamp) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Annotator(object): """ An Annotator object describes a human (or possibly a set of humans) who generated the data stored in a Label or Rating object. Attributes: - annotator_id - annotation_time: Approximate time when annotations/judgments/labels was rendered by human. If this is missing, it means that the time was not recorded, which often happens when the author made the annotation. 
""" __slots__ = [ 'annotator_id', 'annotation_time', ] thrift_spec = ( None, # 0 (1, TType.STRING, 'annotator_id', None, None, ), # 1 (2, TType.STRUCT, 'annotation_time', (StreamTime, StreamTime.thrift_spec), None, ), # 2 ) def __init__(self, annotator_id=None, annotation_time=None,): self.annotator_id = annotator_id self.annotation_time = annotation_time def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.STRING: self.annotator_id = iprot.readString(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRUCT: self.annotation_time = StreamTime() self.annotation_time.read(iprot) else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Annotator') if self.annotator_id is not None: oprot.writeFieldBegin('annotator_id', TType.STRING, 1) oprot.writeString(self.annotator_id) oprot.writeFieldEnd() if self.annotation_time is not None: oprot.writeFieldBegin('annotation_time', TType.STRUCT, 2) self.annotation_time.write(oprot) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, 
attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Offset(object): """ Offset specifies a range within a field of data in this ContentItem Attributes: - type: see comments on OffsetType - first: actual offset, which could be measured in bytes, chars, or lines. The data element identified by 'first' is included, and that identified by first+length is also included. In set notation, [first:first+length-1] or equivalently [first:first+length) or in list slicing, like python's: [first:first+length] While thrift treats these as signed integers, negative values are meaningless in this context, i.e. we do not end wrap. N.B. When this is an xpath offset, `length` is always `0` and `first` is always the first xpath offset in correspondence with the `xpath` member. - length - xpath: If this is an xpath offset, then this is set to the xpath address of the start text node. The relative start character offset is in `first`. - content_form: name of the data element inside a ContentItem to which this label applies, e.g. 'raw' 'clean_html' or 'clean_visible'. Defaults to clean_visible, which is the most common case. - value: bytes specified by this offset extracted from the original; just to assist in debugging - xpath_end: If this is an xpath range, then this is set to the xpath address of the end text node. The relative end character offset is in `xpath2_offset`. Note that `xpath` and `first` have the same relationship as `xpath_end` and `xpath_end_offset`. - xpath_end_offset: If this is an xpath offset, then this is set to the ending xpath's relative char offset. (`first` contains the start offset.) Note that this offset participates in the half-open interval: [(xpath, first), (xpath_end, xpath_end_offset)). 
""" __slots__ = [ 'type', 'first', 'length', 'xpath', 'content_form', 'value', 'xpath_end', 'xpath_end_offset', ] thrift_spec = ( None, # 0 (1, TType.I32, 'type', None, None, ), # 1 (2, TType.I64, 'first', None, None, ), # 2 (3, TType.I32, 'length', None, None, ), # 3 (4, TType.STRING, 'xpath', None, None, ), # 4 (5, TType.STRING, 'content_form', None, "clean_visible", ), # 5 (6, TType.STRING, 'value', None, None, ), # 6 (7, TType.STRING, 'xpath_end', None, None, ), # 7 (8, TType.I64, 'xpath_end_offset', None, None, ), # 8 ) def __init__(self, type=None, first=None, length=None, xpath=None, content_form=thrift_spec[5][4], value=None, xpath_end=None, xpath_end_offset=None,): self.type = type self.first = first self.length = length self.xpath = xpath self.content_form = content_form self.value = value self.xpath_end = xpath_end self.xpath_end_offset = xpath_end_offset def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.I32: self.type = iprot.readI32(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.I64: self.first = iprot.readI64(); else: iprot.skip(ftype) elif fid == 3: if ftype == TType.I32: self.length = iprot.readI32(); else: iprot.skip(ftype) elif fid == 4: if ftype == TType.STRING: self.xpath = iprot.readString(); else: iprot.skip(ftype) elif fid == 5: if ftype == TType.STRING: self.content_form = iprot.readString(); else: iprot.skip(ftype) elif fid == 6: if ftype == TType.STRING: self.value = iprot.readString(); else: iprot.skip(ftype) elif fid == 7: if ftype == TType.STRING: self.xpath_end = iprot.readString(); else: iprot.skip(ftype) elif fid == 8: if ftype == TType.I64: 
self.xpath_end_offset = iprot.readI64(); else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Offset') if self.type is not None: oprot.writeFieldBegin('type', TType.I32, 1) oprot.writeI32(self.type) oprot.writeFieldEnd() if self.first is not None: oprot.writeFieldBegin('first', TType.I64, 2) oprot.writeI64(self.first) oprot.writeFieldEnd() if self.length is not None: oprot.writeFieldBegin('length', TType.I32, 3) oprot.writeI32(self.length) oprot.writeFieldEnd() if self.xpath is not None: oprot.writeFieldBegin('xpath', TType.STRING, 4) oprot.writeString(self.xpath) oprot.writeFieldEnd() if self.content_form is not None: oprot.writeFieldBegin('content_form', TType.STRING, 5) oprot.writeString(self.content_form) oprot.writeFieldEnd() if self.value is not None: oprot.writeFieldBegin('value', TType.STRING, 6) oprot.writeString(self.value) oprot.writeFieldEnd() if self.xpath_end is not None: oprot.writeFieldBegin('xpath_end', TType.STRING, 7) oprot.writeString(self.xpath_end) oprot.writeFieldEnd() if self.xpath_end_offset is not None: oprot.writeFieldBegin('xpath_end_offset', TType.I64, 8) oprot.writeI64(self.xpath_end_offset) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Target(object): """ Targets are 
"informationt targets," such as entities or topics, usually from a knowledge base, such as Wikipedia. Attributes: - target_id: unique string identifier, usually a URL into Wikipedia, Freebase, or some other structured reference system for info targets. - kb_id: kb_id is usually redundant if the target_id is a full URL, e.g. en.wikipedia.org - kb_snapshot_time: moment in history that the target_kb was accessed """ __slots__ = [ 'target_id', 'kb_id', 'kb_snapshot_time', ] thrift_spec = ( None, # 0 (1, TType.STRING, 'target_id', None, None, ), # 1 (2, TType.STRING, 'kb_id', None, None, ), # 2 (3, TType.STRUCT, 'kb_snapshot_time', (StreamTime, StreamTime.thrift_spec), None, ), # 3 ) def __init__(self, target_id=None, kb_id=None, kb_snapshot_time=None,): self.target_id = target_id self.kb_id = kb_id self.kb_snapshot_time = kb_snapshot_time def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.STRING: self.target_id = iprot.readString(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRING: self.kb_id = iprot.readString(); else: iprot.skip(ftype) elif fid == 3: if ftype == TType.STRUCT: self.kb_snapshot_time = StreamTime() self.kb_snapshot_time.read(iprot) else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Target') if self.target_id is not None: oprot.writeFieldBegin('target_id', 
TType.STRING, 1) oprot.writeString(self.target_id) oprot.writeFieldEnd() if self.kb_id is not None: oprot.writeFieldBegin('kb_id', TType.STRING, 2) oprot.writeString(self.kb_id) oprot.writeFieldEnd() if self.kb_snapshot_time is not None: oprot.writeFieldBegin('kb_snapshot_time', TType.STRUCT, 3) self.kb_snapshot_time.write(oprot) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Label(object): """ Labels are human generated assertions about a portion of a document For example, a human author might label their own text by inserting hyperlinks to Wikipedia, or a NIST assessor might record which tokens in a text mention a target entity. Label instances can be attached in three palces: - Token.labels list - Sentence.labels list - ContentItem.labels map Attributes: - annotator: identifies the source of this Label - target: identifies the information need assessed by annotator - offsets: pointers to data to which this label applies. If empty, then label applies to the entire Token, Sentence, or ContentItem to which it is attached. - positive: Labels are usually positive assertions that the token mentions the target_id. It is sometimes useful to collect negative assertions that a token is NOT the target_id, which can be indicated by setting Label.positive to False - comments: Save notes from Annotator about this Rating - mentions: Record strings that are "mentions" of the target in this text. 
Note: there used to be a field 'contains mention' which would allow for a document to be labeled as about a thing without containing a string naming the thing. That hardly ever actually happened, but maybe someday it could be added back if needed. - relevance: numerical score assigned by annotator to "judge" or "rate" the utility of this StreamItem to addressing the target information need. The range and interpretation of relevance numbers depends on the annotator. relevance can represent a rank ordering or an enumeration such as -1=Garbage, 0=Neutral, 1=Useful, 2=Vital - stream_id: Stream ID for this label. This is the stream_id for the source StreamItem, if a label is stored independently from its original data. - flags: General purpose flags. These flags can be used to mark documents as meeting an extensible set of criteria. """ __slots__ = [ 'annotator', 'target', 'offsets', 'positive', 'comments', 'mentions', 'relevance', 'stream_id', 'flags', ] thrift_spec = ( None, # 0 (1, TType.STRUCT, 'annotator', (Annotator, Annotator.thrift_spec), None, ), # 1 (2, TType.STRUCT, 'target', (Target, Target.thrift_spec), None, ), # 2 (3, TType.MAP, 'offsets', (TType.I32,None,TType.STRUCT,(Offset, Offset.thrift_spec)), { }, ), # 3 (4, TType.BOOL, 'positive', None, True, ), # 4 (5, TType.STRING, 'comments', None, None, ), # 5 (6, TType.LIST, 'mentions', (TType.STRING,None), None, ), # 6 (7, TType.I16, 'relevance', None, None, ), # 7 (8, TType.STRING, 'stream_id', None, None, ), # 8 (9, TType.LIST, 'flags', (TType.I32,None), None, ), # 9 ) def __init__(self, annotator=None, target=None, offsets=thrift_spec[3][4], positive=thrift_spec[4][4], comments=None, mentions=None, relevance=None, stream_id=None, flags=None,): self.annotator = annotator self.target = target if offsets is self.thrift_spec[3][4]: offsets = { } self.offsets = offsets self.positive = positive self.comments = comments self.mentions = mentions self.relevance = relevance self.stream_id = stream_id self.flags = 
flags def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.STRUCT: self.annotator = Annotator() self.annotator.read(iprot) else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRUCT: self.target = Target() self.target.read(iprot) else: iprot.skip(ftype) elif fid == 3: if ftype == TType.MAP: self.offsets = {} (_ktype1, _vtype2, _size0 ) = iprot.readMapBegin() for _i4 in xrange(_size0): _key5 = iprot.readI32(); _val6 = Offset() _val6.read(iprot) self.offsets[_key5] = _val6 iprot.readMapEnd() else: iprot.skip(ftype) elif fid == 4: if ftype == TType.BOOL: self.positive = iprot.readBool(); else: iprot.skip(ftype) elif fid == 5: if ftype == TType.STRING: self.comments = iprot.readString(); else: iprot.skip(ftype) elif fid == 6: if ftype == TType.LIST: self.mentions = [] (_etype10, _size7) = iprot.readListBegin() for _i11 in xrange(_size7): _elem12 = iprot.readString(); self.mentions.append(_elem12) iprot.readListEnd() else: iprot.skip(ftype) elif fid == 7: if ftype == TType.I16: self.relevance = iprot.readI16(); else: iprot.skip(ftype) elif fid == 8: if ftype == TType.STRING: self.stream_id = iprot.readString(); else: iprot.skip(ftype) elif fid == 9: if ftype == TType.LIST: self.flags = [] (_etype16, _size13) = iprot.readListBegin() for _i17 in xrange(_size13): _elem18 = iprot.readI32(); self.flags.append(_elem18) iprot.readListEnd() else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: 
oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Label') if self.annotator is not None: oprot.writeFieldBegin('annotator', TType.STRUCT, 1) self.annotator.write(oprot) oprot.writeFieldEnd() if self.target is not None: oprot.writeFieldBegin('target', TType.STRUCT, 2) self.target.write(oprot) oprot.writeFieldEnd() if self.offsets is not None: oprot.writeFieldBegin('offsets', TType.MAP, 3) oprot.writeMapBegin(TType.I32, TType.STRUCT, len(self.offsets)) for kiter19,viter20 in self.offsets.items(): oprot.writeI32(kiter19) viter20.write(oprot) oprot.writeMapEnd() oprot.writeFieldEnd() if self.positive is not None: oprot.writeFieldBegin('positive', TType.BOOL, 4) oprot.writeBool(self.positive) oprot.writeFieldEnd() if self.comments is not None: oprot.writeFieldBegin('comments', TType.STRING, 5) oprot.writeString(self.comments) oprot.writeFieldEnd() if self.mentions is not None: oprot.writeFieldBegin('mentions', TType.LIST, 6) oprot.writeListBegin(TType.STRING, len(self.mentions)) for iter21 in self.mentions: oprot.writeString(iter21) oprot.writeListEnd() oprot.writeFieldEnd() if self.relevance is not None: oprot.writeFieldBegin('relevance', TType.I16, 7) oprot.writeI16(self.relevance) oprot.writeFieldEnd() if self.stream_id is not None: oprot.writeFieldBegin('stream_id', TType.STRING, 8) oprot.writeString(self.stream_id) oprot.writeFieldEnd() if self.flags is not None: oprot.writeFieldBegin('flags', TType.LIST, 9) oprot.writeListBegin(TType.I32, len(self.flags)) for iter22 in self.flags: oprot.writeI32(iter22) oprot.writeListEnd() oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) 
other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Attribute(object): """ Description of an attribute of an entity discovered by a tagger in the text. Attributes: - attribute_type: The type of the attribute, see documentation for AttributeType - evidence: UTF-8 string that tagger asserts as evidence of an attribute - value: A normalized, strongly typed value derived from the evidence. The actual type must be determined by programmatically interpretint the attribute_type. For example, attribute_type==AttributeType.PER_GENDER implies that this value will be a string containing an integer index into the Gender enum. For attribute_type that imply a value of type date-time, the value is a zulu_timestamp string from a StreamTime instance. - sentence_id: Zero-based index into the sentences array for this TaggerID - mention_id: Index into the mentions in the document. This identifies the mention to which the attrribute applies """ __slots__ = [ 'attribute_type', 'evidence', 'value', 'sentence_id', 'mention_id', ] thrift_spec = ( None, # 0 (1, TType.I32, 'attribute_type', None, None, ), # 1 (2, TType.STRING, 'evidence', None, None, ), # 2 (3, TType.STRING, 'value', None, None, ), # 3 (4, TType.I32, 'sentence_id', None, None, ), # 4 (5, TType.I32, 'mention_id', None, None, ), # 5 ) def __init__(self, attribute_type=None, evidence=None, value=None, sentence_id=None, mention_id=None,): self.attribute_type = attribute_type self.evidence = evidence self.value = value self.sentence_id = sentence_id self.mention_id = mention_id def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = 
iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.I32: self.attribute_type = iprot.readI32(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRING: self.evidence = iprot.readString(); else: iprot.skip(ftype) elif fid == 3: if ftype == TType.STRING: self.value = iprot.readString(); else: iprot.skip(ftype) elif fid == 4: if ftype == TType.I32: self.sentence_id = iprot.readI32(); else: iprot.skip(ftype) elif fid == 5: if ftype == TType.I32: self.mention_id = iprot.readI32(); else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Attribute') if self.attribute_type is not None: oprot.writeFieldBegin('attribute_type', TType.I32, 1) oprot.writeI32(self.attribute_type) oprot.writeFieldEnd() if self.evidence is not None: oprot.writeFieldBegin('evidence', TType.STRING, 2) oprot.writeString(self.evidence) oprot.writeFieldEnd() if self.value is not None: oprot.writeFieldBegin('value', TType.STRING, 3) oprot.writeString(self.value) oprot.writeFieldEnd() if self.sentence_id is not None: oprot.writeFieldBegin('sentence_id', TType.I32, 4) oprot.writeI32(self.sentence_id) oprot.writeFieldEnd() if self.mention_id is not None: oprot.writeFieldBegin('mention_id', TType.I32, 5) oprot.writeI32(self.mention_id) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return 
False return True def __ne__(self, other): return not (self == other) class Token(object): """ Textual tokens identified by an NLP pipeline and marked up with metadata from automatic taggers and possibly also Labels from humans. Attributes: - token_num: zero-based index into the stream of tokens from a document - token: actual token string, must always be a UTF8 encoded string, not a unicode string, because thrift stores them as 8-bit. - offsets: offsets into the original data (see Offset.content_form) - sentence_pos: zero-based index into the sentence, which is used for dependency parsed data - lemma: lemmatization of the token, again must be UTF8 - pos: part of speech labels defined by Penn TreeBank: http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html Should probably convert this to an enum, analogous to EntityType - entity_type: entity type from named entity recognizer (classifier) - mention_id: Identifier for a each mention in this TaggerID's description of the document. Is unique at the document level. Serves two purposes: 1) Distinguishing multi-token mention. Needed when the entity_type and equiv_id do not change between tokens that are part of separate mentions, e.g. "The senator is known to his friends as David, Davy, Zeus, and Mr. Elephant." 2) Refering to mentions used in Relation objects. - equiv_id: Within-doc coref chain ID. That is, identifier of equivalence class of co-referent tokens. Default is -1, meaning None. - parent_id: parent sentence_pos in dependency parse. Default is -1, ie None - dependency_path: grammatical relation label on path to parent in dependency parse, defined by whatever tagger was used -- should pick a canonical definition here and convert it to an enum. - labels: Labels attached to this token, defaults to an empty map - mention_type: Identify the type of mention, e.g. pronoun, description, proper name - custom_entity_type: CUSTOM entity type from named entity recognizer (classifier). 
If used, then entity_type should be set to EntityType.CUSTOM_TYPE, i.e. 17. This is useful when a specialized tagger has a large number of unique entity types, such as entity:artefact:weapon:blunt Rather than expand EntityType with many more subtypes, we can escape the protection of the enum and just use a string here: """ __slots__ = [ 'token_num', 'token', 'offsets', 'sentence_pos', 'lemma', 'pos', 'entity_type', 'mention_id', 'equiv_id', 'parent_id', 'dependency_path', 'labels', 'mention_type', 'custom_entity_type', ] thrift_spec = ( None, # 0 (1, TType.I32, 'token_num', None, None, ), # 1 (2, TType.STRING, 'token', None, None, ), # 2 (3, TType.MAP, 'offsets', (TType.I32,None,TType.STRUCT,(Offset, Offset.thrift_spec)), { }, ), # 3 (4, TType.I32, 'sentence_pos', None, -1, ), # 4 (5, TType.STRING, 'lemma', None, None, ), # 5 (6, TType.STRING, 'pos', None, None, ), # 6 (7, TType.I32, 'entity_type', None, None, ), # 7 (8, TType.I32, 'mention_id', None, -1, ), # 8 (9, TType.I32, 'equiv_id', None, -1, ), # 9 (10, TType.I32, 'parent_id', None, -1, ), # 10 (11, TType.STRING, 'dependency_path', None, None, ), # 11 (12, TType.MAP, 'labels', (TType.STRING,None,TType.LIST,(TType.STRUCT,(Label, Label.thrift_spec))), { }, ), # 12 (13, TType.I32, 'mention_type', None, None, ), # 13 (14, TType.STRING, 'custom_entity_type', None, None, ), # 14 ) def __init__(self, token_num=None, token=None, offsets=thrift_spec[3][4], sentence_pos=thrift_spec[4][4], lemma=None, pos=None, entity_type=None, mention_id=thrift_spec[8][4], equiv_id=thrift_spec[9][4], parent_id=thrift_spec[10][4], dependency_path=None, labels=thrift_spec[12][4], mention_type=None, custom_entity_type=None,): self.token_num = token_num self.token = token if offsets is self.thrift_spec[3][4]: offsets = { } self.offsets = offsets self.sentence_pos = sentence_pos self.lemma = lemma self.pos = pos self.entity_type = entity_type if mention_id is self.thrift_spec[8][4]: mention_id = -1 self.mention_id = mention_id 
self.equiv_id = equiv_id self.parent_id = parent_id self.dependency_path = dependency_path if labels is self.thrift_spec[12][4]: labels = { } self.labels = labels self.mention_type = mention_type self.custom_entity_type = custom_entity_type def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.I32: self.token_num = iprot.readI32(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRING: self.token = iprot.readString(); else: iprot.skip(ftype) elif fid == 3: if ftype == TType.MAP: self.offsets = {} (_ktype24, _vtype25, _size23 ) = iprot.readMapBegin() for _i27 in xrange(_size23): _key28 = iprot.readI32(); _val29 = Offset() _val29.read(iprot) self.offsets[_key28] = _val29 iprot.readMapEnd() else: iprot.skip(ftype) elif fid == 4: if ftype == TType.I32: self.sentence_pos = iprot.readI32(); else: iprot.skip(ftype) elif fid == 5: if ftype == TType.STRING: self.lemma = iprot.readString(); else: iprot.skip(ftype) elif fid == 6: if ftype == TType.STRING: self.pos = iprot.readString(); else: iprot.skip(ftype) elif fid == 7: if ftype == TType.I32: self.entity_type = iprot.readI32(); else: iprot.skip(ftype) elif fid == 8: if ftype == TType.I32: self.mention_id = iprot.readI32(); else: iprot.skip(ftype) elif fid == 9: if ftype == TType.I32: self.equiv_id = iprot.readI32(); else: iprot.skip(ftype) elif fid == 10: if ftype == TType.I32: self.parent_id = iprot.readI32(); else: iprot.skip(ftype) elif fid == 11: if ftype == TType.STRING: self.dependency_path = iprot.readString(); else: iprot.skip(ftype) elif fid == 12: if ftype == TType.MAP: self.labels = {} (_ktype31, _vtype32, _size30 ) = 
iprot.readMapBegin() for _i34 in xrange(_size30): _key35 = iprot.readString(); _val36 = [] (_etype40, _size37) = iprot.readListBegin() for _i41 in xrange(_size37): _elem42 = Label() _elem42.read(iprot) _val36.append(_elem42) iprot.readListEnd() self.labels[_key35] = _val36 iprot.readMapEnd() else: iprot.skip(ftype) elif fid == 13: if ftype == TType.I32: self.mention_type = iprot.readI32(); else: iprot.skip(ftype) elif fid == 14: if ftype == TType.STRING: self.custom_entity_type = iprot.readString(); else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Token') if self.token_num is not None: oprot.writeFieldBegin('token_num', TType.I32, 1) oprot.writeI32(self.token_num) oprot.writeFieldEnd() if self.token is not None: oprot.writeFieldBegin('token', TType.STRING, 2) oprot.writeString(self.token) oprot.writeFieldEnd() if self.offsets is not None: oprot.writeFieldBegin('offsets', TType.MAP, 3) oprot.writeMapBegin(TType.I32, TType.STRUCT, len(self.offsets)) for kiter43,viter44 in self.offsets.items(): oprot.writeI32(kiter43) viter44.write(oprot) oprot.writeMapEnd() oprot.writeFieldEnd() if self.sentence_pos is not None: oprot.writeFieldBegin('sentence_pos', TType.I32, 4) oprot.writeI32(self.sentence_pos) oprot.writeFieldEnd() if self.lemma is not None: oprot.writeFieldBegin('lemma', TType.STRING, 5) oprot.writeString(self.lemma) oprot.writeFieldEnd() if self.pos is not None: oprot.writeFieldBegin('pos', TType.STRING, 6) oprot.writeString(self.pos) oprot.writeFieldEnd() if self.entity_type is not None: oprot.writeFieldBegin('entity_type', TType.I32, 7) oprot.writeI32(self.entity_type) oprot.writeFieldEnd() if self.mention_id is not None: oprot.writeFieldBegin('mention_id', 
TType.I32, 8) oprot.writeI32(self.mention_id) oprot.writeFieldEnd() if self.equiv_id is not None: oprot.writeFieldBegin('equiv_id', TType.I32, 9) oprot.writeI32(self.equiv_id) oprot.writeFieldEnd() if self.parent_id is not None: oprot.writeFieldBegin('parent_id', TType.I32, 10) oprot.writeI32(self.parent_id) oprot.writeFieldEnd() if self.dependency_path is not None: oprot.writeFieldBegin('dependency_path', TType.STRING, 11) oprot.writeString(self.dependency_path) oprot.writeFieldEnd() if self.labels is not None: oprot.writeFieldBegin('labels', TType.MAP, 12) oprot.writeMapBegin(TType.STRING, TType.LIST, len(self.labels)) for kiter45,viter46 in self.labels.items(): oprot.writeString(kiter45) oprot.writeListBegin(TType.STRUCT, len(viter46)) for iter47 in viter46: iter47.write(oprot) oprot.writeListEnd() oprot.writeMapEnd() oprot.writeFieldEnd() if self.mention_type is not None: oprot.writeFieldBegin('mention_type', TType.I32, 13) oprot.writeI32(self.mention_type) oprot.writeFieldEnd() if self.custom_entity_type is not None: oprot.writeFieldBegin('custom_entity_type', TType.STRING, 14) oprot.writeString(self.custom_entity_type) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Sentence(object): """ Attributes: - tokens: tokens in this sentence - labels: array of instances of Label attached to this sentence, defaults to an empty map """ __slots__ = [ 'tokens', 'labels', ] thrift_spec = ( None, # 0 (1, TType.LIST, 'tokens', (TType.STRUCT,(Token, Token.thrift_spec)), [ ], ), # 1 (2, TType.MAP, 'labels', 
(TType.STRING,None,TType.LIST,(TType.STRUCT,(Label, Label.thrift_spec))), { }, ), # 2 ) def __init__(self, tokens=thrift_spec[1][4], labels=thrift_spec[2][4],): if tokens is self.thrift_spec[1][4]: tokens = [ ] self.tokens = tokens if labels is self.thrift_spec[2][4]: labels = { } self.labels = labels def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.LIST: self.tokens = [] (_etype51, _size48) = iprot.readListBegin() for _i52 in xrange(_size48): _elem53 = Token() _elem53.read(iprot) self.tokens.append(_elem53) iprot.readListEnd() else: iprot.skip(ftype) elif fid == 2: if ftype == TType.MAP: self.labels = {} (_ktype55, _vtype56, _size54 ) = iprot.readMapBegin() for _i58 in xrange(_size54): _key59 = iprot.readString(); _val60 = [] (_etype64, _size61) = iprot.readListBegin() for _i65 in xrange(_size61): _elem66 = Label() _elem66.read(iprot) _val60.append(_elem66) iprot.readListEnd() self.labels[_key59] = _val60 iprot.readMapEnd() else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Sentence') if self.tokens is not None: oprot.writeFieldBegin('tokens', TType.LIST, 1) oprot.writeListBegin(TType.STRUCT, len(self.tokens)) for iter67 in self.tokens: iter67.write(oprot) oprot.writeListEnd() oprot.writeFieldEnd() if self.labels is not None: oprot.writeFieldBegin('labels', TType.MAP, 2) 
oprot.writeMapBegin(TType.STRING, TType.LIST, len(self.labels)) for kiter68,viter69 in self.labels.items(): oprot.writeString(kiter68) oprot.writeListBegin(TType.STRUCT, len(viter69)) for iter70 in viter69: iter70.write(oprot) oprot.writeListEnd() oprot.writeMapEnd() oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Tagging(object): """ Attributes: - tagger_id - raw_tagging: raw output of the tagging tool - tagger_config: short human-readable description of configuration parameters - tagger_version: short human-readable version string of the tagging tool - generation_time: time that tagging was generated """ __slots__ = [ 'tagger_id', 'raw_tagging', 'tagger_config', 'tagger_version', 'generation_time', ] thrift_spec = ( None, # 0 (1, TType.STRING, 'tagger_id', None, None, ), # 1 (2, TType.STRING, 'raw_tagging', None, None, ), # 2 (3, TType.STRING, 'tagger_config', None, None, ), # 3 (4, TType.STRING, 'tagger_version', None, None, ), # 4 (5, TType.STRUCT, 'generation_time', (StreamTime, StreamTime.thrift_spec), None, ), # 5 ) def __init__(self, tagger_id=None, raw_tagging=None, tagger_config=None, tagger_version=None, generation_time=None,): self.tagger_id = tagger_id self.raw_tagging = raw_tagging self.tagger_config = tagger_config self.tagger_version = tagger_version self.generation_time = generation_time def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not 
None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.STRING: self.tagger_id = iprot.readString(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRING: self.raw_tagging = iprot.readString(); else: iprot.skip(ftype) elif fid == 3: if ftype == TType.STRING: self.tagger_config = iprot.readString(); else: iprot.skip(ftype) elif fid == 4: if ftype == TType.STRING: self.tagger_version = iprot.readString(); else: iprot.skip(ftype) elif fid == 5: if ftype == TType.STRUCT: self.generation_time = StreamTime() self.generation_time.read(iprot) else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Tagging') if self.tagger_id is not None: oprot.writeFieldBegin('tagger_id', TType.STRING, 1) oprot.writeString(self.tagger_id) oprot.writeFieldEnd() if self.raw_tagging is not None: oprot.writeFieldBegin('raw_tagging', TType.STRING, 2) oprot.writeString(self.raw_tagging) oprot.writeFieldEnd() if self.tagger_config is not None: oprot.writeFieldBegin('tagger_config', TType.STRING, 3) oprot.writeString(self.tagger_config) oprot.writeFieldEnd() if self.tagger_version is not None: oprot.writeFieldBegin('tagger_version', TType.STRING, 4) oprot.writeString(self.tagger_version) oprot.writeFieldEnd() if self.generation_time is not None: oprot.writeFieldBegin('generation_time', TType.STRUCT, 5) self.generation_time.write(oprot) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] 
return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Selector(object): """ Desription of a selector discovered by an extractor in the text. Attributes: - selector_type: what type of selector this is - raw_selector: the selector string as it appears in the document - canonical_selector: the selector string in a canonical form - offsets: pointer to the selector string within the clean_visible document """ __slots__ = [ 'selector_type', 'raw_selector', 'canonical_selector', 'offsets', ] thrift_spec = ( None, # 0 (1, TType.STRING, 'selector_type', None, None, ), # 1 (2, TType.STRING, 'raw_selector', None, None, ), # 2 (3, TType.STRING, 'canonical_selector', None, None, ), # 3 (4, TType.MAP, 'offsets', (TType.I32,None,TType.STRUCT,(Offset, Offset.thrift_spec)), { }, ), # 4 ) def __init__(self, selector_type=None, raw_selector=None, canonical_selector=None, offsets=thrift_spec[4][4],): self.selector_type = selector_type self.raw_selector = raw_selector self.canonical_selector = canonical_selector if offsets is self.thrift_spec[4][4]: offsets = { } self.offsets = offsets def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.STRING: self.selector_type = iprot.readString(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.STRING: self.raw_selector = iprot.readString(); else: iprot.skip(ftype) elif fid == 3: if 
ftype == TType.STRING: self.canonical_selector = iprot.readString(); else: iprot.skip(ftype) elif fid == 4: if ftype == TType.MAP: self.offsets = {} (_ktype72, _vtype73, _size71 ) = iprot.readMapBegin() for _i75 in xrange(_size71): _key76 = iprot.readI32(); _val77 = Offset() _val77.read(iprot) self.offsets[_key76] = _val77 iprot.readMapEnd() else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Selector') if self.selector_type is not None: oprot.writeFieldBegin('selector_type', TType.STRING, 1) oprot.writeString(self.selector_type) oprot.writeFieldEnd() if self.raw_selector is not None: oprot.writeFieldBegin('raw_selector', TType.STRING, 2) oprot.writeString(self.raw_selector) oprot.writeFieldEnd() if self.canonical_selector is not None: oprot.writeFieldBegin('canonical_selector', TType.STRING, 3) oprot.writeString(self.canonical_selector) oprot.writeFieldEnd() if self.offsets is not None: oprot.writeFieldBegin('offsets', TType.MAP, 4) oprot.writeMapBegin(TType.I32, TType.STRUCT, len(self.offsets)) for kiter78,viter79 in self.offsets.items(): oprot.writeI32(kiter78) viter79.write(oprot) oprot.writeMapEnd() oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Zone(object): """ Desription of a Zone discovered by 
an extractor in the text. Attributes: - zone_type: what type of zone this is - offsets: For a given OffsetType provide a *list* of Offset objects """ __slots__ = [ 'zone_type', 'offsets', ] thrift_spec = ( None, # 0 (1, TType.I32, 'zone_type', None, None, ), # 1 (2, TType.MAP, 'offsets', (TType.I32,None,TType.LIST,(TType.STRUCT,(Offset, Offset.thrift_spec))), { }, ), # 2 ) def __init__(self, zone_type=None, offsets=thrift_spec[2][4],): self.zone_type = zone_type if offsets is self.thrift_spec[2][4]: offsets = { } self.offsets = offsets def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.I32: self.zone_type = iprot.readI32(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.MAP: self.offsets = {} (_ktype81, _vtype82, _size80 ) = iprot.readMapBegin() for _i84 in xrange(_size80): _key85 = iprot.readI32(); _val86 = [] (_etype90, _size87) = iprot.readListBegin() for _i91 in xrange(_size87): _elem92 = Offset() _elem92.read(iprot) _val86.append(_elem92) iprot.readListEnd() self.offsets[_key85] = _val86 iprot.readMapEnd() else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Zone') if self.zone_type is not None: oprot.writeFieldBegin('zone_type', TType.I32, 1) oprot.writeI32(self.zone_type) oprot.writeFieldEnd() if self.offsets is not None: oprot.writeFieldBegin('offsets', TType.MAP, 2) 
oprot.writeMapBegin(TType.I32, TType.LIST, len(self.offsets)) for kiter93,viter94 in self.offsets.items(): oprot.writeI32(kiter93) oprot.writeListBegin(TType.STRUCT, len(viter94)) for iter95 in viter94: iter95.write(oprot) oprot.writeListEnd() oprot.writeMapEnd() oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Relation(object): """ Description of a relation between two entities that a tagger discovered in the text. Attributes: - relation_type: The type of the relation, see documentation for RelationType - sentence_id_1: Zero-based index into the sentences array for this TaggerID - mention_id_1: Index into the mentions in the document. This identifies the origin of the relation. For example, the relation (Bob, PHYS_Located, Chicago) would have mention_id_1 point to Bob. - sentence_id_2: Zero-based index into the sentences array for this TaggerID - mention_id_2: Index into the mentions in the document. This identifies the origin of the relation. For example, the relation (Bob, PHYS_Located, Chicago) would have mention_id_2 point to Chicago. 
""" __slots__ = [ 'relation_type', 'sentence_id_1', 'mention_id_1', 'sentence_id_2', 'mention_id_2', ] thrift_spec = ( None, # 0 (1, TType.I32, 'relation_type', None, None, ), # 1 (2, TType.I32, 'sentence_id_1', None, None, ), # 2 (3, TType.I32, 'mention_id_1', None, None, ), # 3 (4, TType.I32, 'sentence_id_2', None, None, ), # 4 (5, TType.I32, 'mention_id_2', None, None, ), # 5 ) def __init__(self, relation_type=None, sentence_id_1=None, mention_id_1=None, sentence_id_2=None, mention_id_2=None,): self.relation_type = relation_type self.sentence_id_1 = sentence_id_1 self.mention_id_1 = mention_id_1 self.sentence_id_2 = sentence_id_2 self.mention_id_2 = mention_id_2 def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.I32: self.relation_type = iprot.readI32(); else: iprot.skip(ftype) elif fid == 2: if ftype == TType.I32: self.sentence_id_1 = iprot.readI32(); else: iprot.skip(ftype) elif fid == 3: if ftype == TType.I32: self.mention_id_1 = iprot.readI32(); else: iprot.skip(ftype) elif fid == 4: if ftype == TType.I32: self.sentence_id_2 = iprot.readI32(); else: iprot.skip(ftype) elif fid == 5: if ftype == TType.I32: self.mention_id_2 = iprot.readI32(); else: iprot.skip(ftype) else: iprot.skip(ftype) iprot.readFieldEnd() iprot.readStructEnd() def write(self, oprot): if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None: oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec))) return oprot.writeStructBegin('Relation') if self.relation_type is not None: oprot.writeFieldBegin('relation_type', 
TType.I32, 1) oprot.writeI32(self.relation_type) oprot.writeFieldEnd() if self.sentence_id_1 is not None: oprot.writeFieldBegin('sentence_id_1', TType.I32, 2) oprot.writeI32(self.sentence_id_1) oprot.writeFieldEnd() if self.mention_id_1 is not None: oprot.writeFieldBegin('mention_id_1', TType.I32, 3) oprot.writeI32(self.mention_id_1) oprot.writeFieldEnd() if self.sentence_id_2 is not None: oprot.writeFieldBegin('sentence_id_2', TType.I32, 4) oprot.writeI32(self.sentence_id_2) oprot.writeFieldEnd() if self.mention_id_2 is not None: oprot.writeFieldBegin('mention_id_2', TType.I32, 5) oprot.writeI32(self.mention_id_2) oprot.writeFieldEnd() oprot.writeFieldStop() oprot.writeStructEnd() def validate(self): return def __repr__(self): L = ['%s=%r' % (key, getattr(self, key)) for key in self.__slots__] return '%s(%s)' % (self.__class__.__name__, ', '.join(L)) def __eq__(self, other): if not isinstance(other, self.__class__): return False for attr in self.__slots__: my_val = getattr(self, attr) other_val = getattr(other, attr) if my_val != other_val: return False return True def __ne__(self, other): return not (self == other) class Language(object): """ Description of a natural language used in text Attributes: - code: two letter code for the language - name """ __slots__ = [ 'code', 'name', ] thrift_spec = ( None, # 0 (1, TType.STRING, 'code', None, None, ), # 1 (2, TType.STRING, 'name', None, None, ), # 2 ) def __init__(self, code=None, name=None,): self.code = code self.name = name def read(self, iprot): if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None: fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec)) return iprot.readStructBegin() while True: (fname, ftype, fid) = iprot.readFieldBegin() if ftype == TType.STOP: break if fid == 1: if ftype == TType.STRING: self.code = iprot.readString(); else: 
class ContentItem(object):
  """
  ContentItem contains raw data, an indication of its character
  encoding, and various transformed versions of the raw data.

  Attributes:
   - raw: original download, raw byte array
   - encoding: guessed from raw and/or headers, e.g. by python-requests.org
   - media_type: Content-type header from fetching the data, or MIME type
   - clean_html: HTML-formatted version of raw with UTF8 encoding and no
  broken tags.  All HTML-escaped characters are converted to their
  UTF8 equivalents.  < > & are escaped.
   - clean_visible: All tags stripped from clean_html and replaced with
  whitespace, so they have the same byte offsets.  The only escaped
  characters are < > &, so that this can be treated as Character
  Data in XML: http://www.w3.org/TR/xml/#syntax

  Again: must be UTF8
   - logs: Logs generated from processing pipeline, for forensics
   - taggings: A set of auto-generated taggings, such as a One-Word-Per-Line
  (OWLP) tokenization and sentence chunking with part-of-speech,
  lemmatization, and NER classification.  The string name should be
  the same as the tagger_id and also corresponds to the key in
  sentences or sentence_blobs, which get generated by transforming
  a Tagging.raw_tagging into Sentence and Token instances

  Taggings are generated from 'clean_visible' so offsets (byte,
  char, line) refer to clean_visible and clean_html -- not raw.
   - labels: sets of annotations
   - sentences: parsed Sentence objects generated by an NLP pipeline identified
  by the string name, which is a tagger_id that connects this
  Sentences instance to the Tagging struct from which it came
   - sentence_blobs: same as 'sentences' except the array of Sentence instances are
  serialized into a binary string that can be read by the Thrift's
  binary protocol.  This allows lazy deserialization via an
  iterator -- one sentence at a time.  This might be totally
  unnecessary, because at least some of the Thrift language
  implementations have lazy object construction, e.g. --gen
  py:dynamic,slots
   - language: indication of which natural language is used in the text
   - relations: List of relations discovered in clean_visible
   - attributes: List of attributes discovered in clean_visible
   - external_ids: Map of external identifier strings to mention_ids generated by a
  particular tagger.  This allows external systems to associate
  record IDs with individual mentions, or sets of mentions.
   - selectors: Map of external identifier strings to selectors in clean_visible
   - zones: Map of external identifier strings to Zones in clean_visible
  """

  __slots__ = [
    'raw',
    'encoding',
    'media_type',
    'clean_html',
    'clean_visible',
    'logs',
    'taggings',
    'labels',
    'sentences',
    'sentence_blobs',
    'language',
    'relations',
    'attributes',
    'external_ids',
    'selectors',
    'zones',
  ]

  thrift_spec = (
    None,  # 0
    (1, TType.STRING, 'raw', None, None, ),  # 1
    (2, TType.STRING, 'encoding', None, None, ),  # 2
    (3, TType.STRING, 'media_type', None, None, ),  # 3
    (4, TType.STRING, 'clean_html', None, None, ),  # 4
    (5, TType.STRING, 'clean_visible', None, None, ),  # 5
    (6, TType.LIST, 'logs', (TType.STRING,None), [
    ], ),  # 6
    (7, TType.MAP, 'taggings', (TType.STRING,None,TType.STRUCT,(Tagging, Tagging.thrift_spec)), {
    }, ),  # 7
    (8, TType.MAP, 'labels', (TType.STRING,None,TType.LIST,(TType.STRUCT,(Label, Label.thrift_spec))), {
    }, ),  # 8
    (9, TType.MAP, 'sentences', (TType.STRING,None,TType.LIST,(TType.STRUCT,(Sentence, Sentence.thrift_spec))), {
    }, ),  # 9
    (10, TType.MAP, 'sentence_blobs', (TType.STRING,None,TType.STRING,None), {
    }, ),  # 10
    (11, TType.STRUCT, 'language', (Language, Language.thrift_spec), None, ),  # 11
    (12, TType.MAP, 'relations', (TType.STRING,None,TType.LIST,(TType.STRUCT,(Relation, Relation.thrift_spec))), {
    }, ),  # 12
    (13, TType.MAP, 'attributes', (TType.STRING,None,TType.LIST,(TType.STRUCT,(Attribute, Attribute.thrift_spec))), {
    }, ),  # 13
    (14, TType.MAP, 'external_ids', (TType.STRING,None,TType.MAP,(TType.I32,None,TType.STRING,None)), {
    }, ),  # 14
    (15, TType.MAP, 'selectors', (TType.STRING,None,TType.LIST,(TType.STRUCT,(Selector, Selector.thrift_spec))), {
    }, ),  # 15
    (16, TType.MAP, 'zones', (TType.STRING,None,TType.MAP,(TType.I32,None,TType.STRUCT,(Zone, Zone.thrift_spec))), {
    }, ),  # 16
  )

  def __init__(self, raw=None, encoding=None, media_type=None, clean_html=None, clean_visible=None, logs=thrift_spec[6][4], taggings=thrift_spec[7][4], labels=thrift_spec[8][4], sentences=thrift_spec[9][4], sentence_blobs=thrift_spec[10][4], language=None, relations=thrift_spec[12][4], attributes=thrift_spec[13][4], external_ids=thrift_spec[14][4], selectors=thrift_spec[15][4], zones=thrift_spec[16][4],):
    # The container defaults are the shared objects stored in thrift_spec.
    # Each is detected by identity and swapped for a fresh container so
    # that instances never share mutable state.
    self.raw = raw
    self.encoding = encoding
    self.media_type = media_type
    self.clean_html = clean_html
    self.clean_visible = clean_visible
    if logs is self.thrift_spec[6][4]:
      logs = []
    self.logs = logs
    if taggings is self.thrift_spec[7][4]:
      taggings = {}
    self.taggings = taggings
    if labels is self.thrift_spec[8][4]:
      labels = {}
    self.labels = labels
    if sentences is self.thrift_spec[9][4]:
      sentences = {}
    self.sentences = sentences
    if sentence_blobs is self.thrift_spec[10][4]:
      sentence_blobs = {}
    self.sentence_blobs = sentence_blobs
    self.language = language
    if relations is self.thrift_spec[12][4]:
      relations = {}
    self.relations = relations
    if attributes is self.thrift_spec[13][4]:
      attributes = {}
    self.attributes = attributes
    if external_ids is self.thrift_spec[14][4]:
      external_ids = {}
    self.external_ids = external_ids
    if selectors is self.thrift_spec[15][4]:
      selectors = {}
    self.selectors = selectors
    if zones is self.thrift_spec[16][4]:
      zones = {}
    self.zones = zones

  @staticmethod
  def _read_struct_lists(iprot, struct_class, target):
    """Read a map<string, list<struct_class>> from iprot into the dict `target`."""
    (_ktype, _vtype, map_size) = iprot.readMapBegin()
    for _i in range(map_size):
      key = iprot.readString()
      items = []
      (_etype, list_size) = iprot.readListBegin()
      for _j in range(list_size):
        item = struct_class()
        item.read(iprot)
        items.append(item)
      iprot.readListEnd()
      target[key] = items
    iprot.readMapEnd()

  @staticmethod
  def _write_struct_lists(oprot, name, fid, mapping):
    """Write `mapping` as the map<string, list<struct>> field `name`; no-op if it is None."""
    if mapping is None:
      return
    oprot.writeFieldBegin(name, TType.MAP, fid)
    oprot.writeMapBegin(TType.STRING, TType.LIST, len(mapping))
    for key, items in mapping.items():
      oprot.writeString(key)
      oprot.writeListBegin(TType.STRUCT, len(items))
      for item in items:
        item.write(oprot)
      oprot.writeListEnd()
    oprot.writeMapEnd()
    oprot.writeFieldEnd()

  def read(self, iprot):
    """
    Populate this struct from the protocol reader `iprot`.

    Uses fastbinary when the accelerated binary protocol is in use and the
    extension was importable; otherwise walks the fields one by one.
    Fields with an unknown id or an unexpected wire type are skipped.
    """
    if (iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and isinstance(iprot.trans, TTransport.CReadableTransport)
        and self.thrift_spec is not None
        and fastbinary is not None):
      fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
      return

    # NOTE: loops use range() rather than xrange() so this fallback path
    # also runs on Python 3, where xrange no longer exists.
    iprot.readStructBegin()
    while True:
      (_fname, ftype, fid) = iprot.readFieldBegin()
      if ftype == TType.STOP:
        break
      if fid == 1 and ftype == TType.STRING:
        self.raw = iprot.readString()
      elif fid == 2 and ftype == TType.STRING:
        self.encoding = iprot.readString()
      elif fid == 3 and ftype == TType.STRING:
        self.media_type = iprot.readString()
      elif fid == 4 and ftype == TType.STRING:
        self.clean_html = iprot.readString()
      elif fid == 5 and ftype == TType.STRING:
        self.clean_visible = iprot.readString()
      elif fid == 6 and ftype == TType.LIST:
        self.logs = []
        (_etype, size) = iprot.readListBegin()
        for _i in range(size):
          entry = iprot.readString()
          self.logs.append(entry)
        iprot.readListEnd()
      elif fid == 7 and ftype == TType.MAP:
        self.taggings = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          tagging = Tagging()
          tagging.read(iprot)
          self.taggings[key] = tagging
        iprot.readMapEnd()
      elif fid == 8 and ftype == TType.MAP:
        self.labels = {}
        self._read_struct_lists(iprot, Label, self.labels)
      elif fid == 9 and ftype == TType.MAP:
        self.sentences = {}
        self._read_struct_lists(iprot, Sentence, self.sentences)
      elif fid == 10 and ftype == TType.MAP:
        self.sentence_blobs = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          blob = iprot.readString()
          self.sentence_blobs[key] = blob
        iprot.readMapEnd()
      elif fid == 11 and ftype == TType.STRUCT:
        self.language = Language()
        self.language.read(iprot)
      elif fid == 12 and ftype == TType.MAP:
        self.relations = {}
        self._read_struct_lists(iprot, Relation, self.relations)
      elif fid == 13 and ftype == TType.MAP:
        self.attributes = {}
        self._read_struct_lists(iprot, Attribute, self.attributes)
      elif fid == 14 and ftype == TType.MAP:
        # map<string, map<i32, string>>
        self.external_ids = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          inner = {}
          (_ktype2, _vtype2, inner_size) = iprot.readMapBegin()
          for _j in range(inner_size):
            inner_key = iprot.readI32()
            inner_val = iprot.readString()
            inner[inner_key] = inner_val
          iprot.readMapEnd()
          self.external_ids[key] = inner
        iprot.readMapEnd()
      elif fid == 15 and ftype == TType.MAP:
        self.selectors = {}
        self._read_struct_lists(iprot, Selector, self.selectors)
      elif fid == 16 and ftype == TType.MAP:
        # map<string, map<i32, Zone>>
        self.zones = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          inner = {}
          (_ktype2, _vtype2, inner_size) = iprot.readMapBegin()
          for _j in range(inner_size):
            inner_key = iprot.readI32()
            zone = Zone()
            zone.read(iprot)
            inner[inner_key] = zone
          iprot.readMapEnd()
          self.zones[key] = inner
        iprot.readMapEnd()
      else:
        # unknown field id, or known id with the wrong wire type
        iprot.skip(ftype)
      iprot.readFieldEnd()
    iprot.readStructEnd()

  def write(self, oprot):
    """
    Serialize this struct to the protocol writer `oprot`.

    Fields are emitted in ascending field-id order; attributes that are
    None are omitted.
    """
    if (oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and self.thrift_spec is not None
        and fastbinary is not None):
      oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
      return

    oprot.writeStructBegin('ContentItem')
    # fields 1-5 are plain strings
    string_fields = (
      (1, 'raw'),
      (2, 'encoding'),
      (3, 'media_type'),
      (4, 'clean_html'),
      (5, 'clean_visible'),
    )
    for fid, name in string_fields:
      value = getattr(self, name)
      if value is not None:
        oprot.writeFieldBegin(name, TType.STRING, fid)
        oprot.writeString(value)
        oprot.writeFieldEnd()
    if self.logs is not None:
      oprot.writeFieldBegin('logs', TType.LIST, 6)
      oprot.writeListBegin(TType.STRING, len(self.logs))
      for entry in self.logs:
        oprot.writeString(entry)
      oprot.writeListEnd()
      oprot.writeFieldEnd()
    if self.taggings is not None:
      oprot.writeFieldBegin('taggings', TType.MAP, 7)
      oprot.writeMapBegin(TType.STRING, TType.STRUCT, len(self.taggings))
      for key, tagging in self.taggings.items():
        oprot.writeString(key)
        tagging.write(oprot)
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    self._write_struct_lists(oprot, 'labels', 8, self.labels)
    self._write_struct_lists(oprot, 'sentences', 9, self.sentences)
    if self.sentence_blobs is not None:
      oprot.writeFieldBegin('sentence_blobs', TType.MAP, 10)
      oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.sentence_blobs))
      for key, blob in self.sentence_blobs.items():
        oprot.writeString(key)
        oprot.writeString(blob)
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    if self.language is not None:
      oprot.writeFieldBegin('language', TType.STRUCT, 11)
      self.language.write(oprot)
      oprot.writeFieldEnd()
    self._write_struct_lists(oprot, 'relations', 12, self.relations)
    self._write_struct_lists(oprot, 'attributes', 13, self.attributes)
    if self.external_ids is not None:
      oprot.writeFieldBegin('external_ids', TType.MAP, 14)
      oprot.writeMapBegin(TType.STRING, TType.MAP, len(self.external_ids))
      for key, inner in self.external_ids.items():
        oprot.writeString(key)
        oprot.writeMapBegin(TType.I32, TType.STRING, len(inner))
        for inner_key, inner_val in inner.items():
          oprot.writeI32(inner_key)
          oprot.writeString(inner_val)
        oprot.writeMapEnd()
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    self._write_struct_lists(oprot, 'selectors', 15, self.selectors)
    if self.zones is not None:
      oprot.writeFieldBegin('zones', TType.MAP, 16)
      oprot.writeMapBegin(TType.STRING, TType.MAP, len(self.zones))
      for key, inner in self.zones.items():
        oprot.writeString(key)
        oprot.writeMapBegin(TType.I32, TType.STRUCT, len(inner))
        for inner_key, zone in inner.items():
          oprot.writeI32(inner_key)
          zone.write(oprot)
        oprot.writeMapEnd()
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    oprot.writeFieldStop()
    oprot.writeStructEnd()

  def validate(self):
    """No checks are defined for this struct; always returns None."""
    return

  def __repr__(self):
    L = ['%s=%r' % (key, getattr(self, key))
      for key in self.__slots__]
    return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

  def __eq__(self, other):
    """Equal when `other` is an instance of this class and every slot matches."""
    if not isinstance(other, self.__class__):
      return False
    for attr in self.__slots__:
      my_val = getattr(self, attr)
      other_val = getattr(other, attr)
      if my_val != other_val:
        return False
    return True

  def __ne__(self, other):
    return not (self == other)
class Rating(object):
  """
  Ratings are human generated assertions about an entire document's
  utility for a particular topic or entity in a reference KB.

  Attributes:
   - annotator: identifies the source of this Rating
   - target: identifies the information need assessed by annotator
   - relevance: numerical score assigned by annotator to "judge" or "rate" the
  utility of this StreamItem to addressing the target information
  need.  The range and interpretation of relevance numbers depends
  on the annotator.  relevance can represent a rank ordering or an
  enumeration such as -1=Garbage, 0=Neutral, 1=Useful, 2=Vital
   - contains_mention: true|false indication of whether the document mentions the target
  entity.  This is only partially correlated with relevance.  For
  example, a document might mention the entity only in chrome text
  on the side such that it is a Garbage-rated text for that entity.
   - comments: Save notes from Annotator about this Rating
   - mentions: Record strings that are "mentions" of the target in this text
   - flags: General purpose flags. These flags can be used to mark documents
  as meeting an extensible set of criteria.
  """

  __slots__ = [
    'annotator',
    'target',
    'relevance',
    'contains_mention',
    'comments',
    'mentions',
    'flags',
  ]

  thrift_spec = (
    None,  # 0
    (1, TType.STRUCT, 'annotator', (Annotator, Annotator.thrift_spec), None, ),  # 1
    (2, TType.STRUCT, 'target', (Target, Target.thrift_spec), None, ),  # 2
    (3, TType.I16, 'relevance', None, None, ),  # 3
    (4, TType.BOOL, 'contains_mention', None, None, ),  # 4
    (5, TType.STRING, 'comments', None, None, ),  # 5
    (6, TType.LIST, 'mentions', (TType.STRING,None), None, ),  # 6
    (7, TType.LIST, 'flags', (TType.I32,None), None, ),  # 7
  )

  def __init__(self, annotator=None, target=None, relevance=None, contains_mention=None, comments=None, mentions=None, flags=None,):
    self.annotator = annotator
    self.target = target
    self.relevance = relevance
    self.contains_mention = contains_mention
    self.comments = comments
    self.mentions = mentions
    self.flags = flags

  def read(self, iprot):
    """
    Populate this struct from the protocol reader `iprot`.

    Uses fastbinary when the accelerated binary protocol is in use and the
    extension was importable; otherwise walks the fields one by one.
    Fields with an unknown id or an unexpected wire type are skipped.
    """
    if (iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and isinstance(iprot.trans, TTransport.CReadableTransport)
        and self.thrift_spec is not None
        and fastbinary is not None):
      fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
      return

    # NOTE: loops use range() rather than xrange() so this fallback path
    # also runs on Python 3, where xrange no longer exists.
    iprot.readStructBegin()
    while True:
      (_fname, ftype, fid) = iprot.readFieldBegin()
      if ftype == TType.STOP:
        break
      if fid == 1 and ftype == TType.STRUCT:
        self.annotator = Annotator()
        self.annotator.read(iprot)
      elif fid == 2 and ftype == TType.STRUCT:
        self.target = Target()
        self.target.read(iprot)
      elif fid == 3 and ftype == TType.I16:
        self.relevance = iprot.readI16()
      elif fid == 4 and ftype == TType.BOOL:
        self.contains_mention = iprot.readBool()
      elif fid == 5 and ftype == TType.STRING:
        self.comments = iprot.readString()
      elif fid == 6 and ftype == TType.LIST:
        self.mentions = []
        (_etype, size) = iprot.readListBegin()
        for _i in range(size):
          mention = iprot.readString()
          self.mentions.append(mention)
        iprot.readListEnd()
      elif fid == 7 and ftype == TType.LIST:
        self.flags = []
        (_etype, size) = iprot.readListBegin()
        for _i in range(size):
          flag = iprot.readI32()
          self.flags.append(flag)
        iprot.readListEnd()
      else:
        # unknown field id, or known id with the wrong wire type
        iprot.skip(ftype)
      iprot.readFieldEnd()
    iprot.readStructEnd()

  def write(self, oprot):
    """
    Serialize this struct to the protocol writer `oprot`.

    Fields are emitted in ascending field-id order; attributes that are
    None are omitted.
    """
    if (oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and self.thrift_spec is not None
        and fastbinary is not None):
      oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
      return

    oprot.writeStructBegin('Rating')
    if self.annotator is not None:
      oprot.writeFieldBegin('annotator', TType.STRUCT, 1)
      self.annotator.write(oprot)
      oprot.writeFieldEnd()
    if self.target is not None:
      oprot.writeFieldBegin('target', TType.STRUCT, 2)
      self.target.write(oprot)
      oprot.writeFieldEnd()
    if self.relevance is not None:
      oprot.writeFieldBegin('relevance', TType.I16, 3)
      oprot.writeI16(self.relevance)
      oprot.writeFieldEnd()
    if self.contains_mention is not None:
      oprot.writeFieldBegin('contains_mention', TType.BOOL, 4)
      oprot.writeBool(self.contains_mention)
      oprot.writeFieldEnd()
    if self.comments is not None:
      oprot.writeFieldBegin('comments', TType.STRING, 5)
      oprot.writeString(self.comments)
      oprot.writeFieldEnd()
    if self.mentions is not None:
      oprot.writeFieldBegin('mentions', TType.LIST, 6)
      oprot.writeListBegin(TType.STRING, len(self.mentions))
      for mention in self.mentions:
        oprot.writeString(mention)
      oprot.writeListEnd()
      oprot.writeFieldEnd()
    if self.flags is not None:
      oprot.writeFieldBegin('flags', TType.LIST, 7)
      oprot.writeListBegin(TType.I32, len(self.flags))
      for flag in self.flags:
        oprot.writeI32(flag)
      oprot.writeListEnd()
      oprot.writeFieldEnd()
    oprot.writeFieldStop()
    oprot.writeStructEnd()

  def validate(self):
    """No checks are defined for this struct; always returns None."""
    return

  def __repr__(self):
    L = ['%s=%r' % (key, getattr(self, key))
      for key in self.__slots__]
    return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

  def __eq__(self, other):
    """Equal when `other` is an instance of this class and every slot matches."""
    if not isinstance(other, self.__class__):
      return False
    for attr in self.__slots__:
      my_val = getattr(self, attr)
      other_val = getattr(other, attr)
      if my_val != other_val:
        return False
    return True

  def __ne__(self, other):
    return not (self == other)
class StreamItem(object):
  """
  This is the primary interface to the corpus data.  It is called
  StreamItem rather than CorpusItem and has a required StreamTime
  attribute, because even for a static corpus, each document was
  captured at a particular time in Earth history and might have been
  different if captured earlier or later.  All corpora are stream
  corpora, even if they were not explicitly created as such.

  stream_id is the unique identifier for documents in the corpus.

  This is similar to the StreamItem defined in kba.thrift for TREC
  KBA 2012, however it removes the 'title' and 'anchor' fields, which
  can now be represented in other_content.  This means that code that
  was written to read messages from kba.thrift must be updated.

  Attributes:
   - version: must provide a version number here
   - doc_id: md5 hash of the abs_url
   - abs_url: normalized form of the original_url, should be a valid URL
   - schost: scheme://hostname parsed from abs_url
   - original_url: string obtain from some source.  Only present if not a valid URL,
  in which case abs_url was derived from original_url
   - source: string uniquely identifying this data set, should start with a
  year string, such as 'news' or 'social'
   - body: primary content
   - source_metadata: see above for explanation of the values that can appear in this
  dictionary of metadata info from the source.  The string keys in
  this map should be short, descriptive, and free of whitespace.
   - stream_id: stream_id is actual unique identifier for a StreamItem.  The
  format is:

  stream_id = '%d-%s' % (int(stream_time.epoch_ticks), doc_id)
   - stream_time: earliest time that this content was known to exist.  Usually,
  body.raw was also saved at the time of that first observation.
   - other_content: such as title, anchor, extracted, etc.  When present, 'anchor',
  is a single anchor text of a URL pointing to this doc.  Note that
  this does not have metadata like the URL of the page that
  contained this anchor.  Such general link graph data may
  eventually motivate an extension to this thrift interface.
   - ratings: doc-level judgments relating entire StreamItem to a Target
   - external_ids: doc-level map connecting either doc_id or stream_id (or both) to
  external identifiers.  This allows external systems to associate
  record IDs with individual doc_id or stream_id of this document.
  The keys in the second level map can be either doc_id or
  stream_id, or possibly other IDs in the future.
  """

  __slots__ = [
    'version',
    'doc_id',
    'abs_url',
    'schost',
    'original_url',
    'source',
    'body',
    'source_metadata',
    'stream_id',
    'stream_time',
    'other_content',
    'ratings',
    'external_ids',
  ]

  thrift_spec = (
    None,  # 0
    (1, TType.I32, 'version', None, 1, ),  # 1
    (2, TType.STRING, 'doc_id', None, None, ),  # 2
    (3, TType.STRING, 'abs_url', None, None, ),  # 3
    (4, TType.STRING, 'schost', None, None, ),  # 4
    (5, TType.STRING, 'original_url', None, None, ),  # 5
    (6, TType.STRING, 'source', None, None, ),  # 6
    (7, TType.STRUCT, 'body', (ContentItem, ContentItem.thrift_spec), None, ),  # 7
    (8, TType.MAP, 'source_metadata', (TType.STRING,None,TType.STRING,None), {
    }, ),  # 8
    (9, TType.STRING, 'stream_id', None, None, ),  # 9
    (10, TType.STRUCT, 'stream_time', (StreamTime, StreamTime.thrift_spec), None, ),  # 10
    (11, TType.MAP, 'other_content', (TType.STRING,None,TType.STRUCT,(ContentItem, ContentItem.thrift_spec)), {
    }, ),  # 11
    (12, TType.MAP, 'ratings', (TType.STRING,None,TType.LIST,(TType.STRUCT,(Rating, Rating.thrift_spec))), {
    }, ),  # 12
    None,  # 13
    (14, TType.MAP, 'external_ids', (TType.STRING,None,TType.MAP,(TType.STRING,None,TType.STRING,None)), {
    }, ),  # 14
  )

  def __init__(self, version=thrift_spec[1][4], doc_id=None, abs_url=None, schost=None, original_url=None, source=None, body=None, source_metadata=thrift_spec[8][4], stream_id=None, stream_time=None, other_content=thrift_spec[11][4], ratings=thrift_spec[12][4], external_ids=thrift_spec[14][4],):
    # The map defaults are the shared dicts stored in thrift_spec.  Each
    # is detected by identity and swapped for a fresh dict so that
    # instances never share mutable state.
    self.version = version
    self.doc_id = doc_id
    self.abs_url = abs_url
    self.schost = schost
    self.original_url = original_url
    self.source = source
    self.body = body
    if source_metadata is self.thrift_spec[8][4]:
      source_metadata = {}
    self.source_metadata = source_metadata
    self.stream_id = stream_id
    self.stream_time = stream_time
    if other_content is self.thrift_spec[11][4]:
      other_content = {}
    self.other_content = other_content
    if ratings is self.thrift_spec[12][4]:
      ratings = {}
    self.ratings = ratings
    if external_ids is self.thrift_spec[14][4]:
      external_ids = {}
    self.external_ids = external_ids

  def read(self, iprot):
    """
    Populate this struct from the protocol reader `iprot`.

    Uses fastbinary when the accelerated binary protocol is in use and the
    extension was importable; otherwise walks the fields one by one.
    Fields with an unknown id or an unexpected wire type are skipped.
    """
    if (iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and isinstance(iprot.trans, TTransport.CReadableTransport)
        and self.thrift_spec is not None
        and fastbinary is not None):
      fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
      return

    # NOTE: loops use range() rather than xrange() so this fallback path
    # also runs on Python 3, where xrange no longer exists.
    iprot.readStructBegin()
    while True:
      (_fname, ftype, fid) = iprot.readFieldBegin()
      if ftype == TType.STOP:
        break
      if fid == 1 and ftype == TType.I32:
        self.version = iprot.readI32()
      elif fid == 2 and ftype == TType.STRING:
        self.doc_id = iprot.readString()
      elif fid == 3 and ftype == TType.STRING:
        self.abs_url = iprot.readString()
      elif fid == 4 and ftype == TType.STRING:
        self.schost = iprot.readString()
      elif fid == 5 and ftype == TType.STRING:
        self.original_url = iprot.readString()
      elif fid == 6 and ftype == TType.STRING:
        self.source = iprot.readString()
      elif fid == 7 and ftype == TType.STRUCT:
        self.body = ContentItem()
        self.body.read(iprot)
      elif fid == 8 and ftype == TType.MAP:
        self.source_metadata = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          value = iprot.readString()
          self.source_metadata[key] = value
        iprot.readMapEnd()
      elif fid == 9 and ftype == TType.STRING:
        self.stream_id = iprot.readString()
      elif fid == 10 and ftype == TType.STRUCT:
        self.stream_time = StreamTime()
        self.stream_time.read(iprot)
      elif fid == 11 and ftype == TType.MAP:
        self.other_content = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          content = ContentItem()
          content.read(iprot)
          self.other_content[key] = content
        iprot.readMapEnd()
      elif fid == 12 and ftype == TType.MAP:
        # map<string, list<Rating>>
        self.ratings = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          rating_list = []
          (_etype, list_size) = iprot.readListBegin()
          for _j in range(list_size):
            rating = Rating()
            rating.read(iprot)
            rating_list.append(rating)
          iprot.readListEnd()
          self.ratings[key] = rating_list
        iprot.readMapEnd()
      elif fid == 14 and ftype == TType.MAP:
        # map<string, map<string, string>>
        self.external_ids = {}
        (_ktype, _vtype, size) = iprot.readMapBegin()
        for _i in range(size):
          key = iprot.readString()
          inner = {}
          (_ktype2, _vtype2, inner_size) = iprot.readMapBegin()
          for _j in range(inner_size):
            inner_key = iprot.readString()
            inner_val = iprot.readString()
            inner[inner_key] = inner_val
          iprot.readMapEnd()
          self.external_ids[key] = inner
        iprot.readMapEnd()
      else:
        # unknown field id (including the unused id 13), or wrong wire type
        iprot.skip(ftype)
      iprot.readFieldEnd()
    iprot.readStructEnd()

  def write(self, oprot):
    """
    Serialize this struct to the protocol writer `oprot`.

    Fields are emitted in ascending field-id order; attributes that are
    None are omitted.
    """
    if (oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated
        and self.thrift_spec is not None
        and fastbinary is not None):
      oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
      return

    oprot.writeStructBegin('StreamItem')
    if self.version is not None:
      oprot.writeFieldBegin('version', TType.I32, 1)
      oprot.writeI32(self.version)
      oprot.writeFieldEnd()
    if self.doc_id is not None:
      oprot.writeFieldBegin('doc_id', TType.STRING, 2)
      oprot.writeString(self.doc_id)
      oprot.writeFieldEnd()
    if self.abs_url is not None:
      oprot.writeFieldBegin('abs_url', TType.STRING, 3)
      oprot.writeString(self.abs_url)
      oprot.writeFieldEnd()
    if self.schost is not None:
      oprot.writeFieldBegin('schost', TType.STRING, 4)
      oprot.writeString(self.schost)
      oprot.writeFieldEnd()
    if self.original_url is not None:
      oprot.writeFieldBegin('original_url', TType.STRING, 5)
      oprot.writeString(self.original_url)
      oprot.writeFieldEnd()
    if self.source is not None:
      oprot.writeFieldBegin('source', TType.STRING, 6)
      oprot.writeString(self.source)
      oprot.writeFieldEnd()
    if self.body is not None:
      oprot.writeFieldBegin('body', TType.STRUCT, 7)
      self.body.write(oprot)
      oprot.writeFieldEnd()
    if self.source_metadata is not None:
      oprot.writeFieldBegin('source_metadata', TType.MAP, 8)
      oprot.writeMapBegin(TType.STRING, TType.STRING, len(self.source_metadata))
      for key, value in self.source_metadata.items():
        oprot.writeString(key)
        oprot.writeString(value)
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    if self.stream_id is not None:
      oprot.writeFieldBegin('stream_id', TType.STRING, 9)
      oprot.writeString(self.stream_id)
      oprot.writeFieldEnd()
    if self.stream_time is not None:
      oprot.writeFieldBegin('stream_time', TType.STRUCT, 10)
      self.stream_time.write(oprot)
      oprot.writeFieldEnd()
    if self.other_content is not None:
      oprot.writeFieldBegin('other_content', TType.MAP, 11)
      oprot.writeMapBegin(TType.STRING, TType.STRUCT, len(self.other_content))
      for key, content in self.other_content.items():
        oprot.writeString(key)
        content.write(oprot)
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    if self.ratings is not None:
      oprot.writeFieldBegin('ratings', TType.MAP, 12)
      oprot.writeMapBegin(TType.STRING, TType.LIST, len(self.ratings))
      for key, rating_list in self.ratings.items():
        oprot.writeString(key)
        oprot.writeListBegin(TType.STRUCT, len(rating_list))
        for rating in rating_list:
          rating.write(oprot)
        oprot.writeListEnd()
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    if self.external_ids is not None:
      oprot.writeFieldBegin('external_ids', TType.MAP, 14)
      oprot.writeMapBegin(TType.STRING, TType.MAP, len(self.external_ids))
      for key, inner in self.external_ids.items():
        oprot.writeString(key)
        oprot.writeMapBegin(TType.STRING, TType.STRING, len(inner))
        for inner_key, inner_val in inner.items():
          oprot.writeString(inner_key)
          oprot.writeString(inner_val)
        oprot.writeMapEnd()
      oprot.writeMapEnd()
      oprot.writeFieldEnd()
    oprot.writeFieldStop()
    oprot.writeStructEnd()

  def validate(self):
    """No checks are defined for this struct; always returns None."""
    return

  def __repr__(self):
    L = ['%s=%r' % (key, getattr(self, key))
      for key in self.__slots__]
    return '%s(%s)' % (self.__class__.__name__, ', '.join(L))

  def __eq__(self, other):
    """Equal when `other` is an instance of this class and every slot matches."""
    if not isinstance(other, self.__class__):
      return False
    for attr in self.__slots__:
      my_val = getattr(self, attr)
      other_val = getattr(other, attr)
      if my_val != other_val:
        return False
    return True

  def __ne__(self, other):
    return not (self == other)