| from inputs.fields.field import Field |
| import logging |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class RawTokenField(Field):
    """Field that preserves the raw text of tokens.

    No vocabulary lookup takes place: `index` copies the tokens found under
    `source_key` of every sentence into the instance entry named `namespace`.
    """

    def __init__(self, namespace, source_key):
        """Set the namespace of the field and the dataset source key.

        Arguments:
            namespace {str} -- namespace of field
            source_key {str} -- indicate key in text data
        """

        super().__init__()
        # Both values are coerced to `str` before being stored.
        self.namespace = str(namespace)
        self.source_key = str(source_key)

    def count_vocab_items(self, counter, sentences):
        """Do nothing: `RawTokenField` doesn't update counter.

        Arguments:
            counter {dict} -- counter (left untouched)
            sentences {list} -- text content after preprocessing (unused)
        """

    def index(self, instance, vocab, sentences):
        """Preserve the raw text of sentences (tokens) without using the
        vocabulary.

        For every sentence, a new list holding the items of
        `sentence[self.source_key]` is appended to `instance[self.namespace]`,
        so `instance[self.namespace]` must support `append`.

        Arguments:
            instance {dict} -- numerical representation of text data,
                updated in place
            vocab {Vocabulary} -- vocabulary (unused)
            sentences {list} -- text content after preprocessing
        """

        for sentence in sentences:
            # Store a shallow copy so the instance does not alias the
            # sentence's own token container.
            instance[self.namespace].append(list(sentence[self.source_key]))

        # Lazy %-style arguments: the message is only rendered when the
        # INFO level is actually enabled for this logger.
        logger.info(
            "Index sentences %s to construct instance namespace %s successfully.",
            self.source_key, self.namespace)
|
|