From 4bb5ccdf4cda167ce85511e83d74d7dacd32ac19 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 23 Aug 2019 16:34:37 +0100
Subject: [PATCH] train segmentation model (#9)

* train and upload segmentation model

* keep grobid-home on data volume

* fixed train-segmentation-model-with-dataset executable
---
 Makefile                       |  33 +++++++++++
 scripts/train-header-model.sh  |  73 +----------------------
 scripts/train-model.sh         | 105 +++++++++++++++++++++++++++++++++
 scripts/upload-header-model.sh |  14 +----
 scripts/upload-model.sh        |  32 ++++++++++
 5 files changed, 174 insertions(+), 83 deletions(-)
 create mode 100755 scripts/train-model.sh
 create mode 100755 scripts/upload-model.sh

diff --git a/Makefile b/Makefile
index d592193..34ca53e 100644
--- a/Makefile
+++ b/Makefile
@@ -83,6 +83,39 @@ upload-header-model: build
 	$(RUN) upload-header-model.sh "$(CLOUD_MODELS_PATH)"
 
 
+copy-raw-segmentation-training-data-to-tei: build
+	$(RUN) bash -c '\
+		mkdir -p "$(DATASET_DIR)/segmentation/corpus/tei" && \
+		cp "$(DATASET_DIR)/segmentation/corpus/tei-raw/"*.xml "$(DATASET_DIR)/segmentation/corpus/tei/" \
+		'
+
+
+train-segmentation-model-with-dataset: build
+	$(RUN) train-model.sh \
+		--dataset "$(DATASET_DIR)" \
+		--model segmentation \
+		$(TRAIN_ARGS)
+
+
+train-segmentation-model-with-default-dataset: build
+	$(RUN) train-model.sh \
+		--use-default-dataset \
+		--model segmentation \
+		$(TRAIN_ARGS)
+
+
+train-segmentation-model-with-dataset-and-default-dataset: build
+	$(RUN) train-model.sh \
+		--dataset "$(DATASET_DIR)" \
+		--use-default-dataset \
+		--model segmentation \
+		$(TRAIN_ARGS)
+
+
+upload-segmentation-model: build
+	$(RUN) upload-model.sh "$(CLOUD_MODELS_PATH)" "segmentation"
+
+
 shell: build
 	$(RUN) bash
 
diff --git a/scripts/train-header-model.sh b/scripts/train-header-model.sh
index 5429971..36fca99 100755
--- a/scripts/train-header-model.sh
+++ b/scripts/train-header-model.sh
@@ -2,75 +2,6 @@
 
 set -e
 
-SOURCE_DATASET_DIR="/opt/grobid-source/grobid-trainer/resources/dataset"
-TRAIN_DATASET_DIR="/opt/grobid/resources/dataset"
-CLOUD_MODELS_PATH="${CLOUD_MODELS_PATH}"
+SCRIPT_HOME="$(dirname "$0")"
 
-DATASETS=()
-
-echo "args: $@"
-
-POSITIONAL=()
-while [[ $# -gt 0 ]]; do
-    key="$1"
-    case $key in
-        --use-default-dataset)
-        DATASETS+=("$SOURCE_DATASET_DIR")
-        shift # past argument
-        ;;
-
-        --dataset)
-        DATASETS+=("$2")
-        shift # past argument
-        shift # past value
-        ;;
-
-        --cloud-models-path)
-        CLOUD_MODELS_PATH="$2"
-        shift # past argument
-        shift # past value
-        ;;
-
-        *)    # unknown option
-        POSITIONAL+=("$1")
-        shift # past argument
-        ;;
-    esac
-done
-set -- "${POSITIONAL[@]}" # restore positional parameters
-
-if [ -z "${DATASETS}" ]; then
-    echo "Error: no datasets enabled"
-    exit 1
-fi
-
-echo "DATASETS=${DATASETS[@]}"
-
-rm -rf "${TRAIN_DATASET_DIR}/header"
-mkdir -p "${TRAIN_DATASET_DIR}/header"
-cp -ar "${SOURCE_DATASET_DIR}/header/crfpp-templates" "$TRAIN_DATASET_DIR/header/crfpp-templates"
-
-for dataset in ${DATASETS[@]}; do
-    echo "dataset=$dataset"
-    mkdir -p "${TRAIN_DATASET_DIR}/header/corpus/"
-    gsutil -m cp -r "${dataset}/header/corpus/headers" "${TRAIN_DATASET_DIR}/header/corpus/"
-    gsutil -m cp -r "${dataset}/header/corpus/tei" "${TRAIN_DATASET_DIR}/header/corpus/"
-    gunzip -f "${TRAIN_DATASET_DIR}/header/corpus/headers/"*.gz || true
-    gunzip -f "${TRAIN_DATASET_DIR}/header/corpus/tei/"*.gz || true
-done
-
-ls -l --recursive "${TRAIN_DATASET_DIR}/header"
-
-if [ ! -d "/opt/grobid/grobid-home" ]; then
-    echo "directory /opt/grobid/grobid-home not found, copying from source..."
-    cp -ar "/opt/grobid-source/grobid-home" "/opt/grobid/grobid-home"
-fi
-
-java ${JAVA_OPTS} -jar grobid-trainer-onejar.jar \
-    0 header \
-    -gH /opt/grobid/grobid-home \
-    $@
-
-if [ ! -z "${CLOUD_MODELS_PATH}" ]; then
-    upload-header-model.sh "${CLOUD_MODELS_PATH}"
-fi
+"${SCRIPT_HOME}/train-model.sh" --model "header" "$@"
diff --git a/scripts/train-model.sh b/scripts/train-model.sh
new file mode 100755
index 0000000..8c41add
--- /dev/null
+++ b/scripts/train-model.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+set -e
+
+SOURCE_DATASET_DIR="/opt/grobid-source/grobid-trainer/resources/dataset"
+TRAIN_DATASET_DIR="/opt/grobid/resources/dataset"
+CLOUD_MODELS_PATH="${CLOUD_MODELS_PATH}"
+
+DATASETS=()
+MODEL_NAME=""
+
+echo "args: $@"
+
+POSITIONAL=()
+while [[ $# -gt 0 ]]; do
+    key="$1"
+    case $key in
+        --use-default-dataset)
+        DATASETS+=("$SOURCE_DATASET_DIR")
+        shift # past argument
+        ;;
+
+        --dataset)
+        DATASETS+=("$2")
+        shift # past argument
+        shift # past value
+        ;;
+
+        --model)
+        MODEL_NAME="$2"
+        shift # past argument
+        shift # past value
+        ;;
+
+        --cloud-models-path)
+        CLOUD_MODELS_PATH="$2"
+        shift # past argument
+        shift # past value
+        ;;
+
+        *)    # unknown option
+        POSITIONAL+=("$1")
+        shift # past argument
+        ;;
+    esac
+done
+set -- "${POSITIONAL[@]}" # restore positional parameters
+
+if [ ${#DATASETS[@]} -eq 0 ]; then
+    echo "Error: no datasets enabled"
+    exit 1
+fi
+
+if [ -z "${MODEL_NAME}" ]; then
+    echo "Error: --model required"
+    exit 1
+fi
+
+echo "DATASETS=${DATASETS[@]}"
+
+if [ "${MODEL_NAME}" == "segmentation" ]; then
+    sub_dirs=(
+        "segmentation/corpus/raw"
+        "segmentation/corpus/tei"
+    )
+elif [ "${MODEL_NAME}" == "header" ]; then
+    sub_dirs=(
+        "header/corpus/headers"
+        "header/corpus/tei"
+    )
+else
+    echo "Unsupported model: ${MODEL_NAME}"
+    exit 2
+fi
+
+rm -rf "${TRAIN_DATASET_DIR}/${MODEL_NAME}"
+mkdir -p "${TRAIN_DATASET_DIR}/${MODEL_NAME}"
+cp -ar "${SOURCE_DATASET_DIR}/${MODEL_NAME}/crfpp-templates" "$TRAIN_DATASET_DIR/${MODEL_NAME}/crfpp-templates"
+
+for dataset in "${DATASETS[@]}"; do
+    echo "dataset=$dataset"
+    for sub_dir in "${sub_dirs[@]}"; do
+        echo "copying ${dataset}/${sub_dir}..."
+        mkdir -p "${TRAIN_DATASET_DIR}/${sub_dir}"
+        gsutil -m cp "${dataset}/${sub_dir}/*" "${TRAIN_DATASET_DIR}/${sub_dir}/"
+        gunzip -f "${TRAIN_DATASET_DIR}/${sub_dir}/"*.gz || true
+    done
+done
+
+ls -l --recursive "${TRAIN_DATASET_DIR}/${MODEL_NAME}"
+
+if [ ! -d "/opt/grobid/grobid-home" ]; then
+    echo "directory /opt/grobid/grobid-home not found, copying from source..."
+    cp -ar "/opt/grobid-source/grobid-home" "/data/grobid-home"
+    ln -s "/data/grobid-home" "/opt/grobid/grobid-home"
+fi
+
+java ${JAVA_OPTS} -jar grobid-trainer-onejar.jar \
+    0 "${MODEL_NAME}" \
+    -gH /opt/grobid/grobid-home \
+    "$@"
+
+if [ ! -z "${CLOUD_MODELS_PATH}" ]; then
+    upload-model.sh "${CLOUD_MODELS_PATH}" "${MODEL_NAME}"
+fi
diff --git a/scripts/upload-header-model.sh b/scripts/upload-header-model.sh
index 7faa55e..86cefcd 100755
--- a/scripts/upload-header-model.sh
+++ b/scripts/upload-header-model.sh
@@ -2,16 +2,6 @@
 
 set -e
 
-CLOUD_MODELS_PATH=${1:-$CLOUD_MODELS_PATH}
+SCRIPT_HOME="$(dirname "$0")"
 
-if [ -z "${CLOUD_MODELS_PATH}" ]; then
-    echo "Error: CLOUD_MODELS_PATH required"
-    exit 1
-fi
-
-echo "uploading header model to ${CLOUD_MODELS_PATH}"
-
-gsutil cp -Z "/opt/grobid/grobid-home/models/header/model.wapiti" \
-    "${CLOUD_MODELS_PATH}/header/model.wapiti.gz"
-
-gsutil ls -l "${CLOUD_MODELS_PATH}/header"
+MODEL_NAME=header "${SCRIPT_HOME}/upload-model.sh" "$@"
diff --git a/scripts/upload-model.sh b/scripts/upload-model.sh
new file mode 100755
index 0000000..3ff3957
--- /dev/null
+++ b/scripts/upload-model.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+set -e
+
+CLOUD_MODELS_PATH=${1:-$CLOUD_MODELS_PATH}
+MODEL_NAME=${2:-$MODEL_NAME}
+
+if [ -z "${CLOUD_MODELS_PATH}" ]; then
+    echo "Error: CLOUD_MODELS_PATH required"
+    exit 1
+fi
+
+if [ -z "${MODEL_NAME}" ]; then
+    echo "Error: MODEL_NAME required"
+    exit 1
+fi
+
+GROBID_HOME="/opt/grobid/grobid-home"
+if [ ! -d "${GROBID_HOME}" ]; then
+    GROBID_HOME="/data/grobid-home"
+fi
+
+if [ ! -d "${GROBID_HOME}" ]; then
+    echo "no grobid home found (have you trained a model yet?)"; exit 1
+fi
+
+echo "uploading ${MODEL_NAME} model to ${CLOUD_MODELS_PATH}"
+
+gsutil cp -Z "${GROBID_HOME}/models/${MODEL_NAME}/model.wapiti" \
+    "${CLOUD_MODELS_PATH}/${MODEL_NAME}/model.wapiti.gz"
+
+gsutil ls -l "${CLOUD_MODELS_PATH}/${MODEL_NAME}"
-- 
GitLab