Add "table" model support and a second sample PDF.

Summary of what this patch does (text before the first "diff --git" line is
ignored by patch tools):

  * Makefile: defines SAMPLE_PDF_URL_2 and downloads it as sample_2.pdf in
    get-example-data; adds copy-raw-table-training-data-to-tei,
    train-table-model-with-dataset, train-table-model-with-default-dataset,
    train-table-model-with-dataset-and-default-dataset and upload-table-model.
  * scripts/copy-raw-training-data-to-file-structure.sh: adds
    has_matching_files, makes mkdir_clean only remove files when the directory
    exists and has files, and replaces the per-model copy code with two shared
    helpers that skip a model when no matching TEI files are present. Adds
    copy_table_files.
  * scripts/train-model.sh and scripts/upload-dataset.sh: register the table
    corpus sub-directories.

NOTE(review): behaviour changes visible in this diff, please confirm they are
intended:
  * copy_affiliation_address_files now copies "*.affiliation.tei.xml"
    (previously "*.references.tei.xml").
  * copy_affiliation_address_files and copy_date_files previously renamed with
    's#\.training\.header\.#\.#'; the shared helper uses 's#\.training\.#\.#',
    so ".header." is no longer stripped from those file names.
  * The tei+raw helper only checks that TEI files exist before copying; the
    raw-pattern copy is not guarded separately.

NOTE(review): leading whitespace and the position of blank context lines were
reconstructed; the per-hunk line counts match the @@ headers, but run
"git apply --check" against the target tree before applying.

diff --git a/Makefile b/Makefile
index 1935aacb592692414d6398a7ec446c794a65d9b9..794aa8f0a55ff4eae03d5b84deaa84daf10990c3 100644
--- a/Makefile
+++ b/Makefile
@@ -22,6 +22,7 @@ TRAIN_ARGS =
 
 USER_AGENT = Dummy/user-agent
 SAMPLE_PDF_URL = https://cdn.elifesciences.org/articles/32671/elife-32671-v2.pdf
+SAMPLE_PDF_URL_2 = https://www.biorxiv.org/content/10.1101/452433v1.full.pdf
 
 # Specify the location where to copy the model to
 CLOUD_MODELS_PATH =
@@ -110,6 +111,8 @@ get-example-data:
 		mkdir -p "$(PDF_DATA_DIR)" \
 		&& curl --fail --show-error --connect-timeout 60 --user-agent "$(USER_AGENT)" --location \
 			"$(SAMPLE_PDF_URL)" --silent -o "$(PDF_DATA_DIR)/sample.pdf" \
+		&& curl --fail --show-error --connect-timeout 60 --user-agent "$(USER_AGENT)" --location \
+			"$(SAMPLE_PDF_URL_2)" --silent -o "$(PDF_DATA_DIR)/sample_2.pdf" \
 		&& ls -l "$(PDF_DATA_DIR)" \
 	'
 
@@ -255,6 +258,39 @@ upload-figure-model:
 	$(RUN) upload-model.sh "$(CLOUD_MODELS_PATH)" "figure"
 
 
+copy-raw-table-training-data-to-tei:
+	$(RUN) bash -c '\
+		mkdir -p "$(DATASET_DIR)/table/corpus/tei" && \
+		cp "$(DATASET_DIR)/table/corpus/tei-raw/"*.xml "$(DATASET_DIR)/table/corpus/tei/" \
+	'
+
+
+train-table-model-with-dataset:
+	$(RUN) train-model.sh \
+		--dataset "$(DATASET_DIR)" \
+		--model table \
+		$(TRAIN_ARGS)
+
+
+train-table-model-with-default-dataset:
+	$(RUN) train-model.sh \
+		--use-default-dataset \
+		--model table \
+		$(TRAIN_ARGS)
+
+
+train-table-model-with-dataset-and-default-dataset:
+	$(RUN) train-model.sh \
+		--dataset "$(DATASET_DIR)" \
+		--use-default-dataset \
+		--model table \
+		$(TRAIN_ARGS)
+
+
+upload-table-model:
+	$(RUN) upload-model.sh "$(CLOUD_MODELS_PATH)" "table"
+
+
 copy-raw-reference-segmenter-training-data-to-tei:
 	$(RUN) bash -c '\
 		mkdir -p "$(DATASET_DIR)/reference-segmenter/corpus/tei" && \
diff --git a/scripts/copy-raw-training-data-to-file-structure.sh b/scripts/copy-raw-training-data-to-file-structure.sh
index ef8a49f86c98fbb3edfe99efdf29a185123dcbb9..c9fcad2eb1991ff2ada9aaa1d77f31bf0e4e8d69 100755
--- a/scripts/copy-raw-training-data-to-file-structure.sh
+++ b/scripts/copy-raw-training-data-to-file-structure.sh
@@ -20,138 +20,151 @@ fi
 echo "RAW_TRAINING_DATA_DIR=${RAW_TRAINING_DATA_DIR}"
 echo "DATASET_DIR=${DATASET_DIR}"
 
+has_matching_files() {
+    local dir="$1"
+    local pattern="$2"
+    if test -n "$(find "${dir}" -maxdepth 1 -type f -name "${pattern}" -print -quit)"; then
+        # echo "files exist: $dir $pattern"
+        return
+    fi
+    # echo "files do not exist: $dir $pattern"
+    false
+}
+
 mkdir_clean() {
     for dir in "$@"; do
         echo "creating or cleaning directory: ${dir}"
-        mkdir -p "${dir}"
-        rm "${dir}"/* || true
+        if [ -d "${dir}" ]; then
+            if has_matching_files "${dir}" "*"; then
+                rm "${dir}"/* || true
+            fi
+        else
+            mkdir -p "${dir}"
+        fi
     done
 }
 
+copy_and_rename_tei_and_raw_training_files() {
+    local tei_dir="$1"
+    local tei_pattern="$2"
+    local raw_dir="$3"
+    local raw_pattern="$4"
+
+    if ! has_matching_files "$RAW_TRAINING_DATA_DIR" "${tei_pattern}"; then
+        echo "no ${tei_pattern} data"
+        return
+    fi
+
+    mkdir_clean "${tei_dir}" "${raw_dir}"
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR (${raw_pattern}) to $raw_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"${raw_pattern} "$raw_dir"
+    rename 's#\.training\.#\.#' "$raw_dir"/*
+
+    echo "copying files from $RAW_TRAINING_DATA_DIR (${tei_pattern}) to $tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"${tei_pattern} "$tei_dir"
+    rename 's#\.training\.#\.#' "$tei_dir"/*
+}
+
 copy_segmentation_files() {
-    segmentation_raw_dir="$DATASET_DIR/segmentation/corpus/raw"
-    segmentation_tei_dir="$DATASET_DIR/segmentation/corpus/tei-raw"
-    mkdir_clean "$segmentation_raw_dir" "${segmentation_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_raw_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation "$segmentation_raw_dir"
-    echo "renaming files $segmentation_raw_dir"
-    rename 's#\.training\.#\.#' "$segmentation_raw_dir"/*
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation.tei.xml "$segmentation_tei_dir"
-    rename 's#\.training\.#\.#' "$segmentation_tei_dir"/*
+    copy_and_rename_tei_and_raw_training_files \
+        "$DATASET_DIR/segmentation/corpus/tei-raw" \
+        "*.segmentation.tei.xml" \
+        "$DATASET_DIR/segmentation/corpus/raw" \
+        "*.segmentation"
 }
 
 copy_header_files() {
-    header_headers_dir="$DATASET_DIR/header/corpus/headers"
-    header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
-    mkdir_clean "$header_headers_dir" "${header_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
-    echo "renaming files $header_headers_dir"
-    rename 's#\.training\.#\.#' "$header_headers_dir"/*
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
-    rename 's#\.training\.#\.#' "$header_tei_dir"/*
+    copy_and_rename_tei_and_raw_training_files \
+        "$DATASET_DIR/header/corpus/tei-raw" \
+        "*.header.tei.xml" \
+        "$DATASET_DIR/header/corpus/headers" \
+        "*.header"
 }
 
 copy_fulltext_files() {
-    fulltext_raw_dir="$DATASET_DIR/fulltext/corpus/raw"
-    fulltext_tei_dir="$DATASET_DIR/fulltext/corpus/tei-raw"
-    mkdir_clean "$fulltext_raw_dir" "${fulltext_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $fulltext_raw_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.fulltext "$fulltext_raw_dir"
-    echo "renaming files $fulltext_raw_dir"
-    rename 's#\.training\.#\.#' "$fulltext_raw_dir"/*
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $fulltext_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.fulltext.tei.xml "$fulltext_tei_dir"
-    rename 's#\.training\.#\.#' "$fulltext_tei_dir"/*
+    copy_and_rename_tei_and_raw_training_files \
+        "$DATASET_DIR/fulltext/corpus/tei-raw" \
+        "*.fulltext.tei.xml" \
+        "$DATASET_DIR/fulltext/corpus/raw" \
+        "*.fulltext"
 }
 
 copy_figure_files() {
-    figure_raw_dir="$DATASET_DIR/figure/corpus/raw"
-    figure_tei_dir="$DATASET_DIR/figure/corpus/tei-raw"
-    mkdir_clean "$figure_raw_dir" "${figure_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $figure_raw_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.figure "$figure_raw_dir"
-    echo "renaming files $figure_raw_dir"
-    rename 's#\.training\.#\.#' "$figure_raw_dir"/*
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $figure_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.figure.tei.xml "$figure_tei_dir"
-    rename 's#\.training\.#\.#' "$figure_tei_dir"/*
+    copy_and_rename_tei_and_raw_training_files \
+        "$DATASET_DIR/figure/corpus/tei-raw" \
+        "*.figure.tei.xml" \
+        "$DATASET_DIR/figure/corpus/raw" \
+        "*.figure"
+}
+
+copy_table_files() {
+    copy_and_rename_tei_and_raw_training_files \
+        "$DATASET_DIR/table/corpus/tei-raw" \
+        "*.table.tei.xml" \
+        "$DATASET_DIR/table/corpus/raw" \
+        "*.table"
 }
 
 copy_reference_segmenter_files() {
-    reference_segmenter_raw_dir="$DATASET_DIR/reference-segmenter/corpus/raw"
-    reference_segmenter_tei_dir="$DATASET_DIR/reference-segmenter/corpus/tei-raw"
-    mkdir_clean "$reference_segmenter_raw_dir" "${reference_segmenter_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $reference_segmenter_raw_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.referenceSegmenter "$reference_segmenter_raw_dir"
-    echo "renaming files $reference_segmenter_raw_dir"
-    rename 's#\.training\.#\.#' "$reference_segmenter_raw_dir"/*
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $reference_segmenter_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.referenceSegmenter.tei.xml "$reference_segmenter_tei_dir"
-    rename 's#\.training\.#\.#' "$reference_segmenter_tei_dir"/*
+    copy_and_rename_tei_and_raw_training_files \
+        "$DATASET_DIR/reference-segmenter/corpus/tei-raw" \
+        "*.referenceSegmenter.tei.xml" \
+        "$DATASET_DIR/reference-segmenter/corpus/raw" \
+        "*.referenceSegmenter"
 }
 
-copy_affiliation_address_files() {
-    affiliation_address_tei_dir="$DATASET_DIR/affiliation-address/corpus-raw"
-    mkdir_clean "${affiliation_address_tei_dir}"
+copy_and_rename_tei_only_training_files() {
+    local tei_dir="$1"
+    local pattern="$2"
+
+    if ! has_matching_files "$RAW_TRAINING_DATA_DIR" "${pattern}"; then
+        echo "no ${pattern} data"
+        return
+    fi
+
+    mkdir_clean "${tei_dir}"
 
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $affiliation_address_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.references.tei.xml "$affiliation_address_tei_dir"
-    rename 's#\.training\.header\.#\.#' "$affiliation_address_tei_dir"/*
+    echo "copying files from $RAW_TRAINING_DATA_DIR (${pattern}) to $tei_dir"
+    cp -a "$RAW_TRAINING_DATA_DIR/"${pattern} "$tei_dir"
+    rename 's#\.training\.#\.#' "$tei_dir"/*
 }
 
-copy_citation_files() {
-    citation_tei_dir="$DATASET_DIR/citation/corpus-raw"
-    mkdir_clean "${citation_tei_dir}"
+copy_affiliation_address_files() {
+    copy_and_rename_tei_only_training_files \
+        "$DATASET_DIR/affiliation-address/corpus-raw" \
+        "*.affiliation.tei.xml"
+}
 
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $citation_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.references.tei.xml "$citation_tei_dir"
-    rename 's#\.training\.#\.#' "$citation_tei_dir"/*
+copy_citation_files() {
+    copy_and_rename_tei_only_training_files \
+        "$DATASET_DIR/citation/corpus-raw" \
+        "*.references.tei.xml"
 }
 
 copy_name_citation_files() {
-    name_citation_tei_dir="$DATASET_DIR/name/citation/corpus-raw"
-    mkdir_clean "${name_citation_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $name_citation_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.references.authors.tei.xml "$name_citation_tei_dir"
-    rename 's#\.training\.#\.#' "$name_citation_tei_dir"/*
+    copy_and_rename_tei_only_training_files \
+        "$DATASET_DIR/name/citation/corpus-raw" \
+        "*.references.authors.tei.xml"
 }
 
 copy_name_header_files() {
-    name_header_tei_dir="$DATASET_DIR/name/header/corpus-raw"
-    mkdir_clean "${name_header_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $name_header_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.header.authors.tei.xml "$name_header_tei_dir"
-    rename 's#\.training\.#\.#' "$name_header_tei_dir"/*
+    copy_and_rename_tei_only_training_files \
+        "$DATASET_DIR/name/header/corpus-raw" \
+        "*.header.authors.tei.xml"
 }
 
 copy_date_files() {
-    date_tei_dir="$DATASET_DIR/date/corpus-raw"
-    mkdir_clean "${date_tei_dir}"
-
-    echo "copying files from $RAW_TRAINING_DATA_DIR to $date_tei_dir"
-    cp -a "$RAW_TRAINING_DATA_DIR/"*.header.date.xml "$date_tei_dir"
-    rename 's#\.training\.header\.#\.#' "$date_tei_dir"/*
+    copy_and_rename_tei_only_training_files \
+        "$DATASET_DIR/date/corpus-raw" \
+        "*.header.date.xml"
 }
 
 copy_segmentation_files
 copy_header_files
 copy_fulltext_files
 copy_figure_files
+copy_table_files
 copy_reference_segmenter_files
 copy_affiliation_address_files
 copy_citation_files
diff --git a/scripts/train-model.sh b/scripts/train-model.sh
index a4d8c5ff1ab6aa1b143ef3416d8c670ab5fba215..5080f3840d5583ef224e8957b46084c692e16249 100755
--- a/scripts/train-model.sh
+++ b/scripts/train-model.sh
@@ -80,6 +80,11 @@ elif [ "${MODEL_NAME}" == "figure" ]; then
         "figure/corpus/raw"
         "figure/corpus/tei"
     )
+elif [ "${MODEL_NAME}" == "table" ]; then
+    sub_dirs=(
+        "table/corpus/raw"
+        "table/corpus/tei"
+    )
 elif [ "${MODEL_NAME}" == "reference-segmenter" ]; then
     sub_dirs=(
         "reference-segmenter/corpus/raw"
diff --git a/scripts/upload-dataset.sh b/scripts/upload-dataset.sh
index 7fe9cbe1096bfd0d934535edd4991fc52bf4d1c6..2e8254c6bc8490513bc87c11210c2094450ae554 100755
--- a/scripts/upload-dataset.sh
+++ b/scripts/upload-dataset.sh
@@ -53,6 +53,10 @@ sub_dirs=(
     "figure/corpus/tei"
     "figure/corpus/tei-raw"
     "figure/corpus/tei-auto"
+    "table/corpus/raw"
+    "table/corpus/tei"
+    "table/corpus/tei-raw"
+    "table/corpus/tei-auto"
     "reference-segmenter/corpus/raw"
     "reference-segmenter/corpus/tei"
     "reference-segmenter/corpus/tei-raw"