From 83506e518e3e1156edd01657510d19875c0740c6 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Wed, 10 Jan 2018 18:45:42 +0000
Subject: [PATCH] added multi threading when loading files

---
 .../models/text/crf/crfsuite_training_pipeline.py     | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py
index 41d8b9d..7f4cb32 100644
--- a/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py
+++ b/sciencebeam_gym/models/text/crf/crfsuite_training_pipeline.py
@@ -2,6 +2,7 @@ import logging
 import argparse
 import pickle
 from functools import partial
+from concurrent.futures import ThreadPoolExecutor
 
 from six import raise_from
 
@@ -117,18 +118,22 @@ def train_model(file_list, cv_file_list, page_range=None, progress=True):
   token_props_list_by_document = []
   total = len(file_list)
   with tqdm(total=total, leave=False, desc='loading files', disable=not progress) as pbar:
-    for filename, cv_filename in zip(file_list, cv_file_list):
-      token_props_list_by_document.append(
+    with ThreadPoolExecutor(max_workers=50) as executor:
+      process_fn = lambda (filename, cv_filename): (
         load_and_convert_to_token_props(filename, cv_filename, page_range=page_range)
       )
-      pbar.update(1)
+      for result in executor.map(process_fn, zip(file_list, cv_file_list)):
+        token_props_list_by_document.append(result)
+        pbar.update(1)
   X = [token_props_list_to_features(x) for x in token_props_list_by_document]
   y = [token_props_list_to_labels(x) for x in token_props_list_by_document]
   model = CrfSuiteModel()
+  get_logger().info('training model (with %d documents)', len(X))
   model.fit(X, y)
   return serialize_model(model)
 
 def save_model(output_filename, model_bytes):
+  get_logger().info('saving model to %s', output_filename)
   save_file_content(output_filename, model_bytes)
 
 def run(opt):
-- 
GitLab