Skip to content
Snippets Groups Projects
Commit 83506e51 authored by Daniel Ecer
Browse files

added multi threading when loading files

parent b15a4eec
No related branches found
No related tags found
No related merge requests found
...@@ -2,6 +2,7 @@ import logging ...@@ -2,6 +2,7 @@ import logging
import argparse import argparse
import pickle import pickle
from functools import partial from functools import partial
from concurrent.futures import ThreadPoolExecutor
from six import raise_from from six import raise_from
...@@ -117,18 +118,22 @@ def train_model(file_list, cv_file_list, page_range=None, progress=True): ...@@ -117,18 +118,22 @@ def train_model(file_list, cv_file_list, page_range=None, progress=True):
token_props_list_by_document = [] token_props_list_by_document = []
total = len(file_list) total = len(file_list)
with tqdm(total=total, leave=False, desc='loading files', disable=not progress) as pbar: with tqdm(total=total, leave=False, desc='loading files', disable=not progress) as pbar:
for filename, cv_filename in zip(file_list, cv_file_list): with ThreadPoolExecutor(max_workers=50) as executor:
token_props_list_by_document.append( process_fn = lambda (filename, cv_filename): (
load_and_convert_to_token_props(filename, cv_filename, page_range=page_range) load_and_convert_to_token_props(filename, cv_filename, page_range=page_range)
) )
pbar.update(1) for result in executor.map(process_fn, zip(file_list, cv_file_list)):
token_props_list_by_document.append(result)
pbar.update(1)
X = [token_props_list_to_features(x) for x in token_props_list_by_document] X = [token_props_list_to_features(x) for x in token_props_list_by_document]
y = [token_props_list_to_labels(x) for x in token_props_list_by_document] y = [token_props_list_to_labels(x) for x in token_props_list_by_document]
model = CrfSuiteModel() model = CrfSuiteModel()
get_logger().info('training model (with %d documents)', len(X))
model.fit(X, y) model.fit(X, y)
return serialize_model(model) return serialize_model(model)
def save_model(output_filename, model_bytes): def save_model(output_filename, model_bytes):
get_logger().info('saving model to %s', output_filename)
save_file_content(output_filename, model_bytes) save_file_content(output_filename, model_bytes)
def run(opt): def run(opt):
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment