Make the autocut port configurable, add a cloud start target, log training stats.

Makefile:
  * PORT (default 8080) is the host port published for container port 8080
    by `autocut-start` and `autocut-start-cloud`; override it on the command
    line, e.g. `make PORT=9090 autocut-start`.
  * `autocut-start-cloud` passes AUTOCUT_MODEL_PATH to the container unchanged
    (no bind mount to /tmp/model.pkl) and mounts the host's ~/.config/gcloud at
    /root/.config/gcloud. Presumably this lets a remote model path be read with
    the caller's gcloud credentials; confirm against start-autocut.sh.

sciencebeam_gym/models/text/crf/autocut_training_pipeline.py:
  * run() logs how many input and target values were loaded (plus the first
    ten of each as a sample) and the length of the serialized model before
    handing it to save_model().

diff --git a/Makefile b/Makefile
index e8fbb2a36c83a284046f98f457e69c9cf0124841..05ab7a70d50394368a5e2807177edb94d489d5c0 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ DOCKER_COMPOSE = $(DOCKER_COMPOSE_DEV)
 
 
 PYTEST_ARGS =
+PORT = 8080
 
 
 .PHONY: all build
@@ -56,7 +57,16 @@ autocut-start: .require-AUTOCUT_MODEL_PATH build
 	$(DOCKER_COMPOSE) run --rm \
 		-v "$(AUTOCUT_MODEL_PATH):/tmp/model.pkl" \
 		-e "AUTOCUT_MODEL_PATH=/tmp/model.pkl" \
-		-p 8080:8080 \
+		-p $(PORT):8080 \
+		sciencebeam-gym \
+		start-autocut.sh
+
+
+autocut-start-cloud: .require-AUTOCUT_MODEL_PATH build
+	$(DOCKER_COMPOSE) run --rm \
+		-v "$$HOME/.config/gcloud:/root/.config/gcloud" \
+		-e "AUTOCUT_MODEL_PATH=$(AUTOCUT_MODEL_PATH)" \
+		-p $(PORT):8080 \
 		sciencebeam-gym \
 		start-autocut.sh
 
diff --git a/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py b/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py
index b75948b2dfadee27f0d817b4740975a3364ecff2..caf902e930c2f28541e2d150d8f744b241c5b3d6 100644
--- a/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py
+++ b/sciencebeam_gym/models/text/crf/autocut_training_pipeline.py
@@ -113,14 +113,15 @@ def run(opt):
         opt.input_file_list, opt.input_file_column, opt.input_xpath, opt.limit,
         opt.namespaces
     )
+    LOGGER.info('loaded %s input values (e.g. %s)', len(input_values), input_values[:10])
     target_values = _load_values(
         opt.target_file_list, opt.target_file_column, opt.target_xpath, opt.limit,
         opt.namespaces
     )
-    save_model(
-        opt.output_path,
-        train_model(input_values, target_values)
-    )
+    LOGGER.info('loaded %s target values (e.g. %s)', len(target_values), target_values[:10])
+    serialized_model = train_model(input_values, target_values)
+    LOGGER.info('model size: %s bytes', '{:,}'.format(len(serialized_model)))
+    save_model(opt.output_path, serialized_model)
 
 
 def main(argv=None):