def load_and_convert_to_token_props(filename):
    """Load one structured document and return its token properties as a list.

    The file is loaded with ``load_structured_document`` and the result is
    passed through ``structured_document_to_token_props``; the converter's
    output is materialized into a list before returning.

    Args:
        filename: location of the structured document to load.

    Returns:
        A list holding every item produced by
        ``structured_document_to_token_props`` for the loaded document.

    Raises:
        RuntimeError: if loading or converting fails. The message names the
            offending file and the original exception is chained as the cause.
    """
    # Function-scope import keeps this block self-contained; six is used so
    # that exception chaining works on both Python 2 and Python 3.
    from six import raise_from

    try:
        structured_document = load_structured_document(filename)
        # list() is evaluated inside the try so that any error raised while
        # iterating the converter's result is also attributed to this file.
        return list(structured_document_to_token_props(structured_document))
    except Exception as e:
        # `StandardError` no longer exists on Python 3 (referencing it there
        # raises NameError while handling the real error), so catch
        # `Exception`, which is available on both major versions and still
        # lets SystemExit / KeyboardInterrupt propagate untouched.
        raise_from(RuntimeError('failed to process %s' % filename), e)