From 4bb5ccdf4cda167ce85511e83d74d7dacd32ac19 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Fri, 23 Aug 2019 16:34:37 +0100
Subject: [PATCH] train segmentation model (#9)

* train and upload segmentation model

* keep grobid-home on data volume

* fixed train-segmentation-model-with-dataset executable
---
 Makefile                       |  33 +++++++++++
 scripts/train-header-model.sh  |  73 +----------------------
 scripts/train-model.sh         | 105 +++++++++++++++++++++++++++++++++
 scripts/upload-header-model.sh |  14 +----
 scripts/upload-model.sh        |  32 ++++++++++
 5 files changed, 174 insertions(+), 83 deletions(-)
 create mode 100755 scripts/train-model.sh
 create mode 100755 scripts/upload-model.sh

diff --git a/Makefile b/Makefile
index d592193..34ca53e 100644
--- a/Makefile
+++ b/Makefile
@@ -83,6 +83,39 @@ upload-header-model: build
 	$(RUN) upload-header-model.sh "$(CLOUD_MODELS_PATH)"
 
 
+copy-raw-segmentation-training-data-to-tei: build
+	$(RUN) bash -c '\
+		mkdir -p "$(DATASET_DIR)/segmentation/corpus/tei" && \
+		cp "$(DATASET_DIR)/segmentation/corpus/tei-raw/"*.xml "$(DATASET_DIR)/segmentation/corpus/tei/" \
+		'
+
+
+train-segmentation-model-with-dataset: build
+	$(RUN) train-model.sh \
+		--dataset "$(DATASET_DIR)" \
+		--model segmentation \
+		$(TRAIN_ARGS)
+
+
+train-segmentation-model-with-default-dataset: build
+	$(RUN) train-model.sh \
+		--use-default-dataset \
+		--model segmentation \
+		$(TRAIN_ARGS)
+
+
+train-segmentation-model-with-dataset-and-default-dataset: build
+	$(RUN) train-model.sh \
+		--dataset "$(DATASET_DIR)" \
+		--use-default-dataset \
+		--model segmentation \
+		$(TRAIN_ARGS)
+
+
+upload-segmentation-model: build
+	$(RUN) upload-model.sh "$(CLOUD_MODELS_PATH)" "segmentation"
+
+
 shell: build
 	$(RUN) bash
 
diff --git a/scripts/train-header-model.sh b/scripts/train-header-model.sh
index 5429971..36fca99 100755
--- a/scripts/train-header-model.sh
+++ b/scripts/train-header-model.sh
@@ -2,75 +2,6 @@
 
 set -e
 
-SOURCE_DATASET_DIR="/opt/grobid-source/grobid-trainer/resources/dataset"
-TRAIN_DATASET_DIR="/opt/grobid/resources/dataset"
-CLOUD_MODELS_PATH="${CLOUD_MODELS_PATH}"
+SCRIPT_HOME="$(dirname "$0")"
 
-DATASETS=()
-
-echo "args: $@"
-
-POSITIONAL=()
-while [[ $# -gt 0 ]]; do
-    key="$1"
-    case $key in
-        --use-default-dataset)
-            DATASETS+=("$SOURCE_DATASET_DIR")
-            shift # past argument
-            ;;
-
-        --dataset)
-            DATASETS+=("$2")
-            shift # past argument
-            shift # past value
-            ;;
-
-        --cloud-models-path)
-            CLOUD_MODELS_PATH="$2"
-            shift # past argument
-            shift # past value
-            ;;
-
-        *) # unknown option
-            POSITIONAL+=("$1")
-            shift # past argument
-            ;;
-    esac
-done
-set -- "${POSITIONAL[@]}" # restore positional parameters
-
-if [ -z "${DATASETS}" ]; then
-    echo "Error: no datasets enabled"
-    exit 1
-fi
-
-echo "DATASETS=${DATASETS[@]}"
-
-rm -rf "${TRAIN_DATASET_DIR}/header"
-mkdir -p "${TRAIN_DATASET_DIR}/header"
-cp -ar "${SOURCE_DATASET_DIR}/header/crfpp-templates" "$TRAIN_DATASET_DIR/header/crfpp-templates"
-
-for dataset in ${DATASETS[@]}; do
-    echo "dataset=$dataset"
-    mkdir -p "${TRAIN_DATASET_DIR}/header/corpus/"
-    gsutil -m cp -r "${dataset}/header/corpus/headers" "${TRAIN_DATASET_DIR}/header/corpus/"
-    gsutil -m cp -r "${dataset}/header/corpus/tei" "${TRAIN_DATASET_DIR}/header/corpus/"
-    gunzip -f "${TRAIN_DATASET_DIR}/header/corpus/headers/"*.gz || true
-    gunzip -f "${TRAIN_DATASET_DIR}/header/corpus/tei/"*.gz || true
-done
-
-ls -l --recursive "${TRAIN_DATASET_DIR}/header"
-
-if [ ! -d "/opt/grobid/grobid-home" ]; then
-d "/opt/grobid/grobid-home" ]; then - echo "directory /opt/grobid/grobid-home not found, copying from source..." - cp -ar "/opt/grobid-source/grobid-home" "/opt/grobid/grobid-home" -fi - -java ${JAVA_OPTS} -jar grobid-trainer-onejar.jar \ - 0 header \ - -gH /opt/grobid/grobid-home \ - $@ - -if [ ! -z "${CLOUD_MODELS_PATH}" ]; then - upload-header-model.sh "${CLOUD_MODELS_PATH}" -fi +"${SCRIPT_HOME}/train-model.sh" --model "header" $@ diff --git a/scripts/train-model.sh b/scripts/train-model.sh new file mode 100755 index 0000000..8c41add --- /dev/null +++ b/scripts/train-model.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +set -e + +SOURCE_DATASET_DIR="/opt/grobid-source/grobid-trainer/resources/dataset" +TRAIN_DATASET_DIR="/opt/grobid/resources/dataset" +CLOUD_MODELS_PATH="${CLOUD_MODELS_PATH}" + +DATASETS=() +MODEL_NAME="" + +echo "args: $@" + +POSITIONAL=() +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --use-default-dataset) + DATASETS+=("$SOURCE_DATASET_DIR") + shift # past argument + ;; + + --dataset) + DATASETS+=("$2") + shift # past argument + shift # past value + ;; + + --model) + MODEL_NAME="$2" + shift # past argument + shift # past value + ;; + + --cloud-models-path) + CLOUD_MODELS_PATH="$2" + shift # past argument + shift # past value + ;; + + *) # unknown option + POSITIONAL+=("$1") + shift # past argument + ;; + esac +done +set -- "${POSITIONAL[@]}" # restore positional parameters + +if [ -z "${DATASETS}" ]; then + echo "Error: no datasets enabled" + exit 1 +fi + +if [ -z "${MODEL_NAME}" ]; then + echo "Error: --model required" + exit 1 +fi + +echo "DATASETS=${DATASETS[@]}" + +if [ "${MODEL_NAME}" == "segmentation" ]; then + sub_dirs=( + "segmentation/corpus/raw" + "segmentation/corpus/tei" + ) +elif [ "${MODEL_NAME}" == "header" ]; then + sub_dirs=( + "header/corpus/headers" + "header/corpus/tei" + ) +else + echo "Unsupported model: ${MODEL_NAME}" + exit 2 +fi + +rm -rf "${TRAIN_DATASET_DIR}/${MODEL_NAME}" +mkdir -p "${TRAIN_DATASET_DIR}/${MODEL_NAME}" +cp -ar "${SOURCE_DATASET_DIR}/${MODEL_NAME}/crfpp-templates" "$TRAIN_DATASET_DIR/${MODEL_NAME}/crfpp-templates" + +for dataset in ${DATASETS[@]}; do + echo "dataset=$dataset" + for sub_dir in "${sub_dirs[@]}"; do + echo "copying ${dataset}/${sub_dir}..." + mkdir -p "${TRAIN_DATASET_DIR}/${sub_dir}" + gsutil -m cp "${dataset}/${sub_dir}/*" "${TRAIN_DATASET_DIR}/${sub_dir}/" + gunzip -f "${TRAIN_DATASET_DIR}/${sub_dir}/"*.gz || true + done +done + +ls -l --recursive "${TRAIN_DATASET_DIR}/${MODEL_NAME}" + +if [ ! -d "/opt/grobid/grobid-home" ]; then + echo "directory /opt/grobid/grobid-home not found, copying from source..." + cp -ar "/opt/grobid-source/grobid-home" "/data/grobid-home" + ln -s "/data/grobid-home" "/opt/grobid/grobid-home" +fi + +java ${JAVA_OPTS} -jar grobid-trainer-onejar.jar \ + 0 "${MODEL_NAME}" \ + -gH /opt/grobid/grobid-home \ + $@ + +if [ ! 
-z "${CLOUD_MODELS_PATH}" ]; then + upload-model.sh "${CLOUD_MODELS_PATH}" "${MODEL_NAME}" +fi diff --git a/scripts/upload-header-model.sh b/scripts/upload-header-model.sh index 7faa55e..86cefcd 100755 --- a/scripts/upload-header-model.sh +++ b/scripts/upload-header-model.sh @@ -2,16 +2,6 @@ set -e -CLOUD_MODELS_PATH=${1:-$CLOUD_MODELS_PATH} +SCRIPT_HOME="$(dirname "$0")" -if [ -z "${CLOUD_MODELS_PATH}" ]; then - echo "Error: CLOUD_MODELS_PATH required" - exit 1 -fi - -echo "uploading header model to ${CLOUD_MODELS_PATH}" - -gsutil cp -Z "/opt/grobid/grobid-home/models/header/model.wapiti" \ - "${CLOUD_MODELS_PATH}/header/model.wapiti.gz" - -gsutil ls -l "${CLOUD_MODELS_PATH}/header" +MODEL_NAME=header "${SCRIPT_HOME}/upload-model.sh" $@ diff --git a/scripts/upload-model.sh b/scripts/upload-model.sh new file mode 100755 index 0000000..3ff3957 --- /dev/null +++ b/scripts/upload-model.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e + +CLOUD_MODELS_PATH=${1:-$CLOUD_MODELS_PATH} +MODEL_NAME=${2:-$MODEL_NAME} + +if [ -z "${CLOUD_MODELS_PATH}" ]; then + echo "Error: CLOUD_MODELS_PATH required" + exit 1 +fi + +if [ -z "${MODEL_NAME}" ]; then + echo "Error: MODEL_NAME required" + exit 1 +fi + +GROBID_HOME="/opt/grobid/grobid-home" +if [ ! -d "${GROBID_HOME}" ]; then + GROBID_HOME="/data/grobid-home" +fi + +if [ ! -d "${GROBID_HOME}" ]; then + echo "no grobid home found (have you trained a model yet?)" +fi + +echo "uploading ${MODEL_NAME} model to ${CLOUD_MODELS_PATH}" + +gsutil cp -Z "${GROBID_HOME}/models/${MODEL_NAME}/model.wapiti" \ + "${CLOUD_MODELS_PATH}/${MODEL_NAME}/model.wapiti.gz" + +gsutil ls -l "${CLOUD_MODELS_PATH}/${MODEL_NAME}" -- GitLab