diff --git a/Makefile b/Makefile
index 24030a5b58a1d70fa9f126092e1ca2d037ae131a..1935aacb592692414d6398a7ec446c794a65d9b9 100644
--- a/Makefile
+++ b/Makefile
@@ -26,6 +26,11 @@ SAMPLE_PDF_URL = https://cdn.elifesciences.org/articles/32671/elife-32671-v2.pdf
 # Specify the location where to copy the model to
 CLOUD_MODELS_PATH =
 
+# Specify the location where to copy the dataset to
+CLOUD_DATASET_PATH = $(CLOUD_DATATSET_PATH)
+# Deprecated, misspelled alias of CLOUD_DATASET_PATH; still honoured as a fallback
+CLOUD_DATATSET_PATH =
+
 NOT_SLOW_PYTEST_ARGS = -m 'not slow'
 
 ARGS =
@@ -117,6 +122,14 @@ generate-grobid-training-data:
 		"$(DATASET_DIR)"
 
 
+# Run upload-dataset.sh with the local dataset directory and the cloud destination as arguments
+.PHONY: upload-dataset
+upload-dataset:
+	$(RUN) upload-dataset.sh \
+		"$(DATASET_DIR)" \
+		"$(CLOUD_DATASET_PATH)"
+
+
 copy-raw-header-training-data-to-tei:
 	$(RUN) bash -c '\
 		mkdir -p "$(DATASET_DIR)/header/corpus/tei" && \
diff --git a/scripts/upload-dataset.sh b/scripts/upload-dataset.sh
index 860b3074cc63bf431eda27fe4c1e12fd09aa3534..7fe9cbe1096bfd0d934535edd4991fc52bf4d1c6 100755
--- a/scripts/upload-dataset.sh
+++ b/scripts/upload-dataset.sh
@@ -45,6 +45,33 @@ sub_dirs=(
     "header/corpus/tei"
     "header/corpus/tei-raw"
     "header/corpus/tei-auto"
+    "fulltext/corpus/raw"
+    "fulltext/corpus/tei"
+    "fulltext/corpus/tei-raw"
+    "fulltext/corpus/tei-auto"
+    "figure/corpus/raw"
+    "figure/corpus/tei"
+    "figure/corpus/tei-raw"
+    "figure/corpus/tei-auto"
+    "reference-segmenter/corpus/raw"
+    "reference-segmenter/corpus/tei"
+    "reference-segmenter/corpus/tei-raw"
+    "reference-segmenter/corpus/tei-auto"
+    "affiliation-address/corpus"
+    "affiliation-address/corpus-raw"
+    "affiliation-address/corpus-auto"
+    "citation/corpus"
+    "citation/corpus-raw"
+    "citation/corpus-auto"
+    "name/citation/corpus"
+    "name/citation/corpus-raw"
+    "name/citation/corpus-auto"
+    "name/header/corpus"
+    "name/header/corpus-raw"
+    "name/header/corpus-auto"
+    "date/corpus"
+    "date/corpus-raw"
+    "date/corpus-auto"
     "xml"
 )
 for sub_dir in "${sub_dirs[@]}"; do