Skip to content
Snippets Groups Projects
Unverified Commit 4bb5ccdf authored by Daniel Ecer's avatar Daniel Ecer Committed by GitHub
Browse files

train segmentation model (#9)

* train and upload segmentation model

* keep grobid-home on data volume

* fixed train-segmentation-model-with-dataset executable
parent 98256f2c
No related branches found
No related tags found
No related merge requests found
......@@ -83,6 +83,39 @@ upload-header-model: build
$(RUN) upload-header-model.sh "$(CLOUD_MODELS_PATH)"
# Promote raw (auto-generated) segmentation TEI files into the corpus "tei"
# directory so they are picked up as training input by train-model.sh.
copy-raw-segmentation-training-data-to-tei: build
	$(RUN) bash -c '\
		mkdir -p "$(DATASET_DIR)/segmentation/corpus/tei" && \
		cp "$(DATASET_DIR)/segmentation/corpus/tei-raw/"*.xml "$(DATASET_DIR)/segmentation/corpus/tei/" \
	'
# Train the segmentation model from the user-supplied dataset only
# (DATASET_DIR may be a local path or a gs:// path; see train-model.sh).
train-segmentation-model-with-dataset: build
	$(RUN) train-model.sh \
		--dataset "$(DATASET_DIR)" \
		--model segmentation \
		$(TRAIN_ARGS)
# Train the segmentation model from the dataset bundled with the grobid
# source tree only.
train-segmentation-model-with-default-dataset: build
	$(RUN) train-model.sh \
		--use-default-dataset \
		--model segmentation \
		$(TRAIN_ARGS)
# Train the segmentation model from the union of the user-supplied dataset
# and the default dataset bundled with the grobid source tree.
train-segmentation-model-with-dataset-and-default-dataset: build
	$(RUN) train-model.sh \
		--dataset "$(DATASET_DIR)" \
		--use-default-dataset \
		--model segmentation \
		$(TRAIN_ARGS)
# Upload the trained segmentation model to CLOUD_MODELS_PATH (gs:// path).
upload-segmentation-model: build
	$(RUN) upload-model.sh "$(CLOUD_MODELS_PATH)" "segmentation"
# Open an interactive shell inside the container for debugging.
shell: build
	$(RUN) bash
......
......@@ -2,75 +2,6 @@
set -e

# NOTE(review): this span is commit-diff residue — it shows the removed body of
# the header-specific trainer together with its replacement (the final
# delegation line). The old body below is superseded by the generic
# train-model.sh and is kept here only as the deleted side of the diff.

SOURCE_DATASET_DIR="/opt/grobid-source/grobid-trainer/resources/dataset"
TRAIN_DATASET_DIR="/opt/grobid/resources/dataset"
CLOUD_MODELS_PATH="${CLOUD_MODELS_PATH}"
SCRIPT_HOME="$(dirname "$0")"

DATASETS=()

echo "args: $@"

# Collect known options; anything unrecognised is kept and passed through
# to the trainer via the restored positional parameters.
POSITIONAL=()
while [[ $# -gt 0 ]]; do
  key="$1"
  case $key in
    --use-default-dataset)
      # use the dataset bundled with the grobid source tree
      DATASETS+=("$SOURCE_DATASET_DIR")
      shift # past argument
      ;;
    --dataset)
      DATASETS+=("$2")
      shift # past argument
      shift # past value
      ;;
    --cloud-models-path)
      CLOUD_MODELS_PATH="$2"
      shift # past argument
      shift # past value
      ;;
    *) # unknown option
      POSITIONAL+=("$1")
      shift # past argument
      ;;
  esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters

# NOTE(review): this only tests the first array element, not the length
if [ -z "${DATASETS}" ]; then
  echo "Error: no datasets enabled"
  exit 1
fi

echo "DATASETS=${DATASETS[@]}"

# Rebuild the header training directory from scratch and copy in the
# CRF++ feature templates from the source tree.
rm -rf "${TRAIN_DATASET_DIR}/header"
mkdir -p "${TRAIN_DATASET_DIR}/header"
cp -ar "${SOURCE_DATASET_DIR}/header/crfpp-templates" "$TRAIN_DATASET_DIR/header/crfpp-templates"

# Copy each dataset's header corpus (raw features + TEI labels); training
# data may be gzipped, so gunzip best-effort afterwards.
for dataset in ${DATASETS[@]}; do
  echo "dataset=$dataset"
  mkdir -p "${TRAIN_DATASET_DIR}/header/corpus/"
  gsutil -m cp -r "${dataset}/header/corpus/headers" "${TRAIN_DATASET_DIR}/header/corpus/"
  gsutil -m cp -r "${dataset}/header/corpus/tei" "${TRAIN_DATASET_DIR}/header/corpus/"
  gunzip -f "${TRAIN_DATASET_DIR}/header/corpus/headers/"*.gz || true
  gunzip -f "${TRAIN_DATASET_DIR}/header/corpus/tei/"*.gz || true
done

ls -l --recursive "${TRAIN_DATASET_DIR}/header"

# Ensure a grobid-home exists at the location the trainer expects.
if [ ! -d "/opt/grobid/grobid-home" ]; then
  echo "directory /opt/grobid/grobid-home not found, copying from source..."
  cp -ar "/opt/grobid-source/grobid-home" "/opt/grobid/grobid-home"
fi

# Run the trainer (mode 0 = train) for the header model.
java ${JAVA_OPTS} -jar grobid-trainer-onejar.jar \
  0 header \
  -gH /opt/grobid/grobid-home \
  $@

# Optionally upload the freshly trained model.
if [ ! -z "${CLOUD_MODELS_PATH}" ]; then
  upload-header-model.sh "${CLOUD_MODELS_PATH}"
fi

# New implementation: delegate to the generic trainer with --model header.
"${SCRIPT_HOME}/train-model.sh" --model "header" $@
#!/bin/bash
# Train a GROBID model (currently "segmentation" or "header") from one or
# more datasets, then optionally upload the result to cloud storage.
#
# Usage:
#   train-model.sh --model <name>
#                  [--dataset <path-or-gs-url>] [--use-default-dataset]
#                  [--cloud-models-path <gs-url>]
#                  [extra args passed through to the grobid trainer]
#
# Datasets are copied into the trainer's dataset directory, unpacked if
# gzipped, and the trainer jar is invoked. CLOUD_MODELS_PATH may also be
# supplied via the environment.

set -e

SOURCE_DATASET_DIR="/opt/grobid-source/grobid-trainer/resources/dataset"
TRAIN_DATASET_DIR="/opt/grobid/resources/dataset"
CLOUD_MODELS_PATH="${CLOUD_MODELS_PATH}"

DATASETS=()
MODEL_NAME=""

echo "args: $@"

# Collect known options; anything unrecognised is kept and passed through
# to the trainer via the restored positional parameters.
POSITIONAL=()
while [[ $# -gt 0 ]]; do
  key="$1"
  case $key in
    --use-default-dataset)
      # use the dataset bundled with the grobid source tree
      DATASETS+=("$SOURCE_DATASET_DIR")
      shift # past argument
      ;;
    --dataset)
      DATASETS+=("$2")
      shift # past argument
      shift # past value
      ;;
    --model)
      MODEL_NAME="$2"
      shift # past argument
      shift # past value
      ;;
    --cloud-models-path)
      CLOUD_MODELS_PATH="$2"
      shift # past argument
      shift # past value
      ;;
    *) # unknown option
      POSITIONAL+=("$1")
      shift # past argument
      ;;
  esac
done
set -- "${POSITIONAL[@]}" # restore positional parameters

# fix: check the array length — `[ -z "${DATASETS}" ]` only inspects the
# first element
if [ "${#DATASETS[@]}" -eq 0 ]; then
  echo "Error: no datasets enabled"
  exit 1
fi

if [ -z "${MODEL_NAME}" ]; then
  echo "Error: --model required"
  exit 1
fi

echo "DATASETS=${DATASETS[*]}"

# Per-model corpus sub directories to copy from each dataset.
if [ "${MODEL_NAME}" == "segmentation" ]; then
  sub_dirs=(
    "segmentation/corpus/raw"
    "segmentation/corpus/tei"
  )
elif [ "${MODEL_NAME}" == "header" ]; then
  sub_dirs=(
    "header/corpus/headers"
    "header/corpus/tei"
  )
else
  echo "Unsupported model: ${MODEL_NAME}"
  exit 2
fi

# Rebuild the model's training directory from scratch.
# fix: ${VAR:?} guards against `rm -rf /<model>` if the variable is ever empty.
rm -rf "${TRAIN_DATASET_DIR:?}/${MODEL_NAME}"
mkdir -p "${TRAIN_DATASET_DIR}/${MODEL_NAME}"
cp -ar "${SOURCE_DATASET_DIR}/${MODEL_NAME}/crfpp-templates" "${TRAIN_DATASET_DIR}/${MODEL_NAME}/crfpp-templates"

# fix: quote the array expansions so paths containing spaces don't word-split
for dataset in "${DATASETS[@]}"; do
  echo "dataset=$dataset"
  for sub_dir in "${sub_dirs[@]}"; do
    echo "copying ${dataset}/${sub_dir}..."
    mkdir -p "${TRAIN_DATASET_DIR}/${sub_dir}"
    # the quoted * is expanded by gsutil itself (works for gs:// and local paths)
    gsutil -m cp "${dataset}/${sub_dir}/*" "${TRAIN_DATASET_DIR}/${sub_dir}/"
    # training data may be gzipped; ignore failure when nothing matches
    gunzip -f "${TRAIN_DATASET_DIR}/${sub_dir}/"*.gz || true
  done
done

ls -l --recursive "${TRAIN_DATASET_DIR}/${MODEL_NAME}"

# Keep grobid-home on the data volume and expose it at the path the
# trainer expects via a symlink.
if [ ! -d "/opt/grobid/grobid-home" ]; then
  echo "directory /opt/grobid/grobid-home not found, copying from source..."
  cp -ar "/opt/grobid-source/grobid-home" "/data/grobid-home"
  ln -s "/data/grobid-home" "/opt/grobid/grobid-home"
fi

# Run the trainer (mode 0 = train) for the selected model.
# fix: quote "$@" so pass-through arguments with spaces survive intact.
java ${JAVA_OPTS} -jar grobid-trainer-onejar.jar \
  0 "${MODEL_NAME}" \
  -gH /opt/grobid/grobid-home \
  "$@"

# Optionally upload the freshly trained model.
if [ -n "${CLOUD_MODELS_PATH}" ]; then
  upload-model.sh "${CLOUD_MODELS_PATH}" "${MODEL_NAME}"
fi
......@@ -2,16 +2,6 @@
set -e

# NOTE(review): commit-diff residue — the removed body of the header-specific
# uploader is shown above its replacement (the final delegation line).

CLOUD_MODELS_PATH=${1:-$CLOUD_MODELS_PATH}
SCRIPT_HOME="$(dirname "$0")"

if [ -z "${CLOUD_MODELS_PATH}" ]; then
  echo "Error: CLOUD_MODELS_PATH required"
  exit 1
fi

# Old behaviour: upload the header model directly (-Z compresses in transit).
echo "uploading header model to ${CLOUD_MODELS_PATH}"
gsutil cp -Z "/opt/grobid/grobid-home/models/header/model.wapiti" \
  "${CLOUD_MODELS_PATH}/header/model.wapiti.gz"
gsutil ls -l "${CLOUD_MODELS_PATH}/header"

# New implementation: delegate to the generic uploader with MODEL_NAME=header.
MODEL_NAME=header "${SCRIPT_HOME}/upload-model.sh" $@
#!/bin/bash
# Upload a trained GROBID model to cloud storage.
#
# Usage: upload-model.sh [CLOUD_MODELS_PATH] [MODEL_NAME]
# Both values may also be supplied via the environment variables of the
# same names; positional arguments take precedence.

set -e

CLOUD_MODELS_PATH=${1:-$CLOUD_MODELS_PATH}
MODEL_NAME=${2:-$MODEL_NAME}

if [ -z "${CLOUD_MODELS_PATH}" ]; then
  echo "Error: CLOUD_MODELS_PATH required"
  exit 1
fi

if [ -z "${MODEL_NAME}" ]; then
  echo "Error: MODEL_NAME required"
  exit 1
fi

# Prefer the standard location; fall back to the data volume used when
# grobid-home is kept outside the container image.
GROBID_HOME="/opt/grobid/grobid-home"
if [ ! -d "${GROBID_HOME}" ]; then
  GROBID_HOME="/data/grobid-home"
fi

if [ ! -d "${GROBID_HOME}" ]; then
  echo "no grobid home found (have you trained a model yet?)"
  # fix: abort with a clear status instead of falling through to a
  # confusing gsutil "file not found" failure
  exit 3
fi

echo "uploading ${MODEL_NAME} model to ${CLOUD_MODELS_PATH}"
# -Z compresses the object in transit and stores it gzip-encoded
gsutil cp -Z "${GROBID_HOME}/models/${MODEL_NAME}/model.wapiti" \
  "${CLOUD_MODELS_PATH}/${MODEL_NAME}/model.wapiti.gz"
gsutil ls -l "${CLOUD_MODELS_PATH}/${MODEL_NAME}"
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment