... | ... | @@ -13,19 +13,18 @@ Ultimately the docs here may belong on a separate page (if/as the pipeline is us |
|
|
|
|
|
# $DOCXdocumentXML is the 'word/document.xml' file extracted (unzipped) from a .docx file
|
|
|
# (Also, its neighbor files from the .docx package should be available.)
|
|
|
|
|
|
DOCXdocumentXML="path/to/docx/word/document.xml"
|
|
|
|
|
|
# $FILE is a short identifier
|
|
|
|
|
|
FILE="Zorba"
|
|
|
|
|
|
# SaxonHE (XSLT 2.0 processor). The java launcher takes the jar path as a
# separate argument after -jar (there is no "-jar:<path>" form). The variable is
# expanded unquoted below so it splits into "java", "-jar" and the path; a jar
# path containing spaces would therefore need a different approach.
saxonHE="java -jar path/to/saxon.jar"
|
|
|
# "Extraction" stylesheet (opening and closing quotes must match, otherwise the
# string runs on into the following lines).
EXTRACT="docx-html-extract1.xsl"
|
|
|
REFINE1="handle-notes.xsl"
|
|
|
# SaxonHE (XSLT 2.0 processor). The java launcher takes the jar path as a
# separate argument after -jar (there is no "-jar:<path>" form). The variable is
# expanded unquoted below so it splits into "java", "-jar" and the path; a jar
# path containing spaces would therefore need a different approach.
saxonHE="java -jar path/to/saxon.jar"
|
|
|
EXTRACT="docx-html-extract1.xsl" # "Extraction" stylesheet
|
|
|
REFINE1="handle-notes.xsl" # "Refinement" stylesheets
|
|
|
REFINE2="scrub.xsl" # "Refinement" stylesheet, applied to the output of the $REFINE1 step
|
|
|
REFINE3="join-elements.xsl" # presumably a further "Refinement" stylesheet; its invocation is not visible in this excerpt
|
|
|
|
|
|
# Intermediate and final outputs (serializations) are all left on the file system
|
|
|
# Each step reads the previous step's output and writes its own file, named
# <FILE>-<stylesheet>_out.html. The braces are required: without them the shell
# reads "$EXTRACT_out" as the (unset) variable EXTRACT_out, and every step would
# read and overwrite the same file "<FILE>-.html".
# NOTE(review): any later step that consumes these outputs must use the same
# ${VAR}_out.html form - confirm against the rest of the script.
# $saxonHE is deliberately left unquoted so it splits into command + arguments.
$saxonHE -xsl:"$EXTRACT" -s:"$DOCXdocumentXML" -o:"${FILE}-${EXTRACT}_out.html"
|
|
|
$saxonHE -xsl:"$REFINE1" -s:"${FILE}-${EXTRACT}_out.html" -o:"${FILE}-${REFINE1}_out.html"
|
|
|
$saxonHE -xsl:"$REFINE2" -s:"${FILE}-${REFINE1}_out.html" -o:"${FILE}-${REFINE2}_out.html"
|
... | ... | |