Skip to content
Snippets Groups Projects
Unverified Commit 98256f2c authored by Daniel Ecer's avatar Daniel Ecer Committed by GitHub
Browse files

copy and upload segmentation training data (#8)

parent 01d5bae8
No related branches found
No related tags found
No related merge requests found
......@@ -20,22 +20,45 @@ fi
# Print the source and destination directories this run works with.
echo "RAW_TRAINING_DATA_DIR=${RAW_TRAINING_DATA_DIR}"
echo "DATASET_DIR=${DATASET_DIR}"
# Destination directories of the header corpus, both located below DATASET_DIR.
header_headers_dir="$DATASET_DIR/header/corpus/headers"
header_tei_dir="$DATASET_DIR/header/corpus/tei-raw"
# Ensure every directory given as an argument exists and contains no
# (non-hidden) files directly inside it.
# Arguments: one or more directory paths.
# Outputs:   one progress line per directory on stdout; errors on stderr.
# Returns:   non-zero if a path is empty or a directory cannot be created.
mkdir_clean() {
  local dir
  for dir in "$@"; do
    # An empty path would turn the cleanup glob below into "/*".
    if [ -z "${dir}" ]; then
      echo "mkdir_clean: refusing to clean an empty directory path" >&2
      return 1
    fi
    echo "creating or cleaning directory: ${dir}"
    mkdir -p -- "${dir}" || return 1
    # -f keeps rm quiet when the directory is already empty; "|| true" stays
    # because rm (without -r) still fails on sub-directories, which are
    # deliberately left in place.
    rm -f -- "${dir:?}"/* || true
  done
}
# Create the header corpus directories (and any missing parents).
mkdir -p "$header_headers_dir"
mkdir -p "$header_tei_dir"
# Fill the segmentation corpus directories below DATASET_DIR with the
# *.segmentation and *.segmentation.tei.xml files from RAW_TRAINING_DATA_DIR.
# Globals: reads RAW_TRAINING_DATA_DIR, DATASET_DIR, header_headers_dir and
#          header_tei_dir; sets segmentation_raw_dir and segmentation_tei_dir.
# NOTE(review): the statements that use header_headers_dir / header_tei_dir
# repeat work that copy_header_files also does and rely on variables that are
# not set inside this function — confirm whether they belong here.
copy_segmentation_files() {
segmentation_raw_dir="$DATASET_DIR/segmentation/corpus/raw"
segmentation_tei_dir="$DATASET_DIR/segmentation/corpus/tei-raw"
mkdir_clean "$segmentation_raw_dir" "${segmentation_tei_dir}"
# Empty the header directories; a failing rm is ignored.
rm "${header_headers_dir}"/* || true
rm "${header_tei_dir}"/* || true
echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_raw_dir"
cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation "$segmentation_raw_dir"
echo "renaming files $segmentation_raw_dir"
# Applies the substitution `.training.` -> `.` to the names of the copied
# files (assumes the Perl-expression flavour of `rename` — TODO confirm).
rename 's#\.training\.#\.#' "$segmentation_raw_dir"/*
echo "copying files from $RAW_TRAINING_DATA_DIR to $header_headers_dir"
cp -a "$RAW_TRAINING_DATA_DIR/"*.header "$header_headers_dir"
echo "renaming files $header_headers_dir"
rename 's#\.training\.#\.#' "$header_headers_dir"/*
echo "copying files from $RAW_TRAINING_DATA_DIR to $segmentation_tei_dir"
cp -a "$RAW_TRAINING_DATA_DIR/"*.segmentation.tei.xml "$segmentation_tei_dir"
rename 's#\.training\.#\.#' "$segmentation_tei_dir"/*
}
# Copy the *.header.tei.xml files into header_tei_dir, then apply the
# `.training.` -> `.` substitution to the names of everything in that directory
# (assumes the Perl-expression flavour of `rename` — TODO confirm).
# NOTE(review): copy_header_files performs the same copy and rename — confirm
# this top-level copy is still wanted.
echo "copying files from $RAW_TRAINING_DATA_DIR to $header_tei_dir"
cp -a "$RAW_TRAINING_DATA_DIR/"*.header.tei.xml "$header_tei_dir"
rename 's#\.training\.#\.#' "$header_tei_dir"/*
# Fill the header corpus directories below DATASET_DIR with the *.header and
# *.header.tei.xml files found in RAW_TRAINING_DATA_DIR, then apply the
# `.training.` -> `.` substitution to the copied file names.
# Globals: reads RAW_TRAINING_DATA_DIR and DATASET_DIR;
#          sets header_headers_dir and header_tei_dir.
copy_header_files() {
  local strip_training_expr='s#\.training\.#\.#'

  header_headers_dir="${DATASET_DIR}/header/corpus/headers"
  header_tei_dir="${DATASET_DIR}/header/corpus/tei-raw"
  mkdir_clean "${header_headers_dir}" "${header_tei_dir}"

  # Raw header files.
  printf '%s\n' "copying files from ${RAW_TRAINING_DATA_DIR} to ${header_headers_dir}"
  cp -a "${RAW_TRAINING_DATA_DIR}"/*.header "${header_headers_dir}"
  printf '%s\n' "renaming files ${header_headers_dir}"
  rename "${strip_training_expr}" "${header_headers_dir}"/*

  # Header TEI files (renamed without a separate progress message).
  printf '%s\n' "copying files from ${RAW_TRAINING_DATA_DIR} to ${header_tei_dir}"
  cp -a "${RAW_TRAINING_DATA_DIR}"/*.header.tei.xml "${header_tei_dir}"
  rename "${strip_training_expr}" "${header_tei_dir}"/*
}
# Run both copy steps, then list everything below DATASET_DIR recursively.
copy_segmentation_files
copy_header_files
ls -l --recursive "${DATASET_DIR}"
......@@ -37,6 +37,10 @@ echo "CLOUD_DATATSET_PATH=${CLOUD_DATATSET_PATH}"
sub_dirs=(
"segmentation/corpus/raw"
"segmentation/corpus/tei"
"segmentation/corpus/tei-raw"
"segmentation/corpus/tei-auto"
"header/corpus/headers"
"header/corpus/tei"
"header/corpus/tei-raw"
......
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment