From 40ff412896fc3b5077803c759beaf92f3e8970cb Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Mon, 10 Jun 2019 22:35:09 +0100
Subject: [PATCH] minor model training logging improvement (#116)

* minor model training logging improvement

* make port configurable

* added autocut-start-cloud
---
 Makefile                                             | 12 +++++++++++-
 .../models/text/crf/autocut_training_pipeline.py     |  9 +++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index e8fbb2a..05ab7a7 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ DOCKER_COMPOSE = $(DOCKER_COMPOSE_DEV)
 
 
 PYTEST_ARGS =
+PORT = 8080
 
 
 .PHONY: all build
@@ -56,7 +57,16 @@ autocut-start: .require-AUTOCUT_MODEL_PATH build
 	$(DOCKER_COMPOSE) run --rm \
 	-v "$(AUTOCUT_MODEL_PATH):/tmp/model.pkl" \
 	-e "AUTOCUT_MODEL_PATH=/tmp/model.pkl" \
-	-p 8080:8080 \
+	-p $(PORT):8080 \
+	sciencebeam-gym \
+	start-autocut.sh
+
+
+autocut-start-cloud: .require-AUTOCUT_MODEL_PATH build  # like autocut-start, but mounts the host gcloud config and passes AUTOCUT_MODEL_PATH through as-is
+	$(DOCKER_COMPOSE) run --rm \
+	-v "$$HOME/.config/gcloud:/root/.config/gcloud" \
+	-e "AUTOCUT_MODEL_PATH=$(AUTOCUT_MODEL_PATH)" \
+	-p $(PORT):8080 \
 	sciencebeam-gym \
 	start-autocut.sh
 
diff --git a/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py b/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py
index b75948b..caf902e 100644
--- a/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py
+++ b/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py
@@ -113,14 +113,15 @@ def run(opt):
         opt.input_file_list, opt.input_file_column, opt.input_xpath, opt.limit,
         opt.namespaces
     )
+    LOGGER.info('loaded %s input values (e.g. %s)', len(input_values), input_values[:10])
     target_values = _load_values(
         opt.target_file_list, opt.target_file_column, opt.target_xpath, opt.limit,
         opt.namespaces
     )
-    save_model(
-        opt.output_path,
-        train_model(input_values, target_values)
-    )
+    LOGGER.info('loaded %s target values (e.g. %s)', len(target_values), target_values[:10])
+    serialized_model = train_model(input_values, target_values)
+    LOGGER.info('model size: {:,} bytes'.format(len(serialized_model)))
+    save_model(opt.output_path, serialized_model)
 
 
 def main(argv=None):
-- 
GitLab