Commit 8263cf26 authored by Wendell Piez

More documentation

parent 9f65a1e3
...@@ -9,6 +9,10 @@ ...@@ -9,6 +9,10 @@
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/> <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- XSweet: rewrites CSS, doing its best to promote CSS settings from style attributes to classes -->
<!-- Input: An HTML Typescript file. -->
<!-- Output: A copy, except rewritten with respect to its use of @style and @class. -->
<xsl:template match="node() | @*"> <xsl:template match="node() | @*">
<xsl:copy> <xsl:copy>
<xsl:apply-templates select="node() | @*"/> <xsl:apply-templates select="node() | @*"/>
......
#### css-abstract.xsl
XSLT stylesheet version 2.0 (6 templates)
XSweet: rewrites CSS, doing its best to promote CSS settings from style attributes to classes
Input: An HTML Typescript file.
Output: A copy, except rewritten with respect to its use of @style and @class.
\ No newline at end of file
...@@ -7,6 +7,11 @@ ...@@ -7,6 +7,11 @@
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/> <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- XSweet: 'wrapper' XSLT for docx extraction with cleanup; using this XSLT in a 3.0 processor replaces five calls to distinct XSLTs. [1] -->
<!-- Input: a WordML document.xml file as extracted from .docx input, with its related (neighbor) files in place -->
<!-- Output: HTML Typescript - fairly clean and regular HTML -->
<!-- Use this XQuery to get a list of stylesheets called by an XProc pipeline: <!-- Use this XQuery to get a list of stylesheets called by an XProc pipeline:
declare namespace p='http://www.w3.org/ns/xproc'; declare namespace p='http://www.w3.org/ns/xproc';
......
...@@ -11,6 +11,11 @@ ...@@ -11,6 +11,11 @@
<!-- Indent should really be no, but for testing. --> <!-- Indent should really be no, but for testing. -->
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/> <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- XSweet: Further removal of redundant expression of formatting properties, especially in service of subsequent
heuristics (where we need to see properties on paragraphs, not only on the objects they contain) .... [3e] -->
<!-- Input: A messy noisy HTML document needing (yet more and even more) streamlining and cleanup. -->
<!-- Output: A copy, with improvements. -->
<!-- Copy everything by default. --> <!-- Copy everything by default. -->
<xsl:template match="node() | @*"> <xsl:template match="node() | @*">
<xsl:copy> <xsl:copy>
......
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
xmlns="http://www.w3.org/1999/xhtml" xmlns:xsw="http://coko.foundation/xsweet" xmlns="http://www.w3.org/1999/xhtml" xmlns:xsw="http://coko.foundation/xsweet"
exclude-result-prefixes="#all"> exclude-result-prefixes="#all">
<!-- XSweet: an EXPERIMENTAL single-pass reduced docx extraction for further development. [ZZ]-->
<!-- For docs on WordML, see (at least): <!-- For docs on WordML, see (at least):
http://webapp.docx4java.org/OnlineDemo/ecma376/WordML/index.html http://webapp.docx4java.org/OnlineDemo/ecma376/WordML/index.html
......
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
<xsl:import href="docx-html-extract.xsl"/> <xsl:import href="docx-html-extract.xsl"/>
<!-- XSweet: A utility XSLT that wraps a call to docx-html-extract.xsl in logic to unpack document.xml from its zip (the .docx file wrapper), thus saving a separate unzipping step when running diagnostics. -->
<!-- A 'shell' stylesheet, permitting us to pass a .docx file as an input parameter, <!-- A 'shell' stylesheet, permitting us to pass a .docx file as an input parameter,
using Java to retrieve the document.xml from inside it and process that file using Java to retrieve the document.xml from inside it and process that file
through imported templates (matching elements in the w: namespace), for "extraction" output. --> through imported templates (matching elements in the w: namespace), for "extraction" output. -->
......
...@@ -9,6 +9,11 @@ ...@@ -9,6 +9,11 @@
xmlns="http://www.w3.org/1999/xhtml" xmlns:xsw="http://coko.foundation/xsweet" xmlns="http://www.w3.org/1999/xhtml" xmlns:xsw="http://coko.foundation/xsweet"
exclude-result-prefixes="#all"> exclude-result-prefixes="#all">
<!-- XSweet: step 1 of docx extraction - pulling the main text, notes and styles.... [3a] -->
<!-- Input: a WordML document.xml file as extracted from .docx input, with its related (neighbor) files in place -->
<!-- Output: Spammy HTML, pretty cruddy, expect to perform cleanup ... -->
<!-- For docs on WordML, see (at least): <!-- For docs on WordML, see (at least):
http://webapp.docx4java.org/OnlineDemo/ecma376/WordML/index.html http://webapp.docx4java.org/OnlineDemo/ecma376/WordML/index.html
......
...@@ -10,6 +10,9 @@ ...@@ -10,6 +10,9 @@
<!-- Indent should really be no, but for testing. --> <!-- Indent should really be no, but for testing. -->
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/> <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- XSweet: notes cleanup, step 2 of regular docx extraction .... [3b] -->
<!-- Input: A messy noisy HTML document straight out of docx-html-extract.xsl -->
<!-- Output: A copy, with some regularization, specifically of footnotes and endnotes ... -->
<xsl:param as="xs:string" name="footnote-format">a</xsl:param> <xsl:param as="xs:string" name="footnote-format">a</xsl:param>
<xsl:param as="xs:string" name="endnote-format" >1</xsl:param> <xsl:param as="xs:string" name="endnote-format" >1</xsl:param>
......
...@@ -11,6 +11,11 @@ ...@@ -11,6 +11,11 @@
<!-- Indent should really be no, but for testing. --> <!-- Indent should really be no, but for testing. -->
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/> <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- XSweet: Further reduces haphazard redundancy in markup by joining adjacent elements with similar properties .... [3d] -->
<!-- Input: A messy noisy HTML document needing (yet more) streamlining and cleanup. -->
<!-- Output: A copy, with improvements. -->
<!-- Copy everything by default. --> <!-- Copy everything by default. -->
<xsl:template match="node() | @*"> <xsl:template match="node() | @*">
<xsl:copy> <xsl:copy>
......
#### EXTRACT-docx.xsl
XSLT stylesheet version 3.0 (5 templates)
XSweet: 'wrapper' XSLT for docx extraction with cleanup; using this XSLT in a 3.0 processor replaces five calls to distinct XSLTs. [1]
Input: a WordML document.xml file as extracted from .docx input, with its related (neighbor) files in place
Output: HTML Typescript - fairly clean and regular HTML
Declared dependency: `docx-html-extract.xsl`
Declared dependency: `handle-notes.xsl`
Declared dependency: `scrub.xsl`
Declared dependency: `join-elements.xsl`
Declared dependency: `collapse-paragraphs.xsl`
#### docx-html-extract.xsl
XSLT stylesheet version 3.0 (72 templates)
XSweet: step 1 of docx extraction - pulling the main text, notes and styles.... [3a]
Input: a WordML document.xml file as extracted from .docx input, with its related (neighbor) files in place
Output: Spammy HTML, pretty cruddy, expect to perform cleanup ...
Compile-time dependency (xsl:include) `docx-table-extract.xsl`
#### handle-notes.xsl
XSLT stylesheet version 3.0 (8 templates)
XSweet: notes cleanup, step 2 of regular docx extraction .... [3b]
Input: A messy noisy HTML document straight out of docx-html-extract.xsl
Output: A copy, with some regularization, specifically of footnotes and endnotes ...
Runtime parameter ``footnote-format`` as xs:string
Runtime parameter ``endnote-format`` as xs:string
#### scrub.xsl
XSLT stylesheet version 3.0 (9 templates)
XSweet: "Scrub" cleanup in service of docx-extraction, usually step 3 .... [3c]
Input: A messy noisy HTML document needing streamlining and cleanup.
Output: A copy, with improvements.
Note: the rule in the extraction XSLT is "make an element for anything", even if it hasn't been mapped - this step has a chance to clean this up, and does so for certain elements known to be innocuous. Occasionally new such elements may need to be matched in this XSLT (detect them by invalid HTML downstream, with unknown element types).
#### join-elements.xsl
XSLT stylesheet version 3.0 (8 templates)
XSweet: Further reduces haphazard redundancy in markup by joining adjacent elements with similar properties .... [3d]
Input: A messy noisy HTML document needing (yet more) streamlining and cleanup.
Output: A copy, with improvements.
#### collapse-paragraphs.xsl
XSLT stylesheet version 3.0 (9 templates)
XSweet: Further removal of redundant expression of formatting properties, especially in service of subsequent heuristics (where we need to see properties on paragraphs, not only on the objects they contain) .... [3e]
Input: A messy noisy HTML document needing (yet more and even more) streamlining and cleanup.
Output: A copy, with improvements.
#### docx-html-extract-mini.xsl
XSLT stylesheet version 3.0 (61 templates)
XSweet: an EXPERIMENTAL single-pass reduced docx extraction for further development. [ZZ]
Compile-time dependency (xsl:include) `docx-table-extract.xsl`
#### docx-html-extract-old.xsl
XSLT stylesheet version 2.0 (55 templates)
#### docx-html-extract-saxon-shell.xsl
XSLT stylesheet version 2.0 (1 template)
Compile-time dependency (xsl:import) `docx-html-extract.xsl`
Runtime parameter ``docx-file-uri`` as xs:string
Runtime parameter ``show-css`` as xs:string
#### docx-table-extract.xsl
XSLT stylesheet version 3.0 (14 templates)
#### quickndirty2.xsl
XSLT stylesheet version 2.0 (21 templates)
XSweet: one of the earliest docx extraction XSLTs, kept here for historical reasons. It's standalone!
Runtime parameter ``show-css`` as xs:string
#### docx-document-production.xpl
XProc pipeline version 1.0 (6 steps)
Runtime dependency: `docx-html-extract.xsl`
Runtime dependency: `handle-notes.xsl`
Runtime dependency: `scrub.xsl`
Runtime dependency: `join-elements.xsl`
Runtime dependency: `collapse-paragraphs.xsl`
\ No newline at end of file
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
xmlns:xsw="http://coko.foundation/xsweet" xmlns:xsw="http://coko.foundation/xsweet"
exclude-result-prefixes="#all"> exclude-result-prefixes="#all">
<!-- XSweet: one of the earliest docx extraction XSLTs, kept here for historical reasons. It's standalone! -->
<!-- Indent should really be no, but for testing. --> <!-- Indent should really be no, but for testing. -->
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/> <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
......
...@@ -10,6 +10,13 @@ ...@@ -10,6 +10,13 @@
<!-- Indent should really be no, but for testing. --> <!-- Indent should really be no, but for testing. -->
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/> <xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- XSweet: "Scrub" cleanup in service of docx-extraction, usually step 3 .... [3c] -->
<!-- Input: A messy noisy HTML document needing streamlining and cleanup. -->
<!-- Output: A copy, with improvements. -->
<!-- Note: the rule in the extraction XSLT is "make an element for anything", even if it hasn't been mapped - this step has a chance to clean this up, and does so for certain elements known to be innocuous. Occasionally new such elements may need to be matched in this XSLT (detect them by invalid HTML downstream, with unknown element types). -->
<!-- Copy everything by default. --> <!-- Copy everything by default. -->
<xsl:template match="node() | @*"> <xsl:template match="node() | @*">
<xsl:copy> <xsl:copy>
......
...@@ -75,7 +75,7 @@ ...@@ -75,7 +75,7 @@
<xsl:apply-templates mode="#current"/> <xsl:apply-templates mode="#current"/>
</xsl:template> </xsl:template>
<xsl:template match="comment()[matches(.,'^\s*(XSweet|Input|Output|Note):')]" mode="report"> <xsl:template match="comment()[matches(.,'^\s*(XSweet|Input|Output|Note|Limitations?):')]" mode="report">
<p> <p>
<xsl:value-of select="normalize-space(.)"/> <xsl:value-of select="normalize-space(.)"/>
</p> </p>
......
#### directory-manifest.xsl
XSLT stylesheet version 3.0 (13 templates)
Runtime parameter ``dirpath`` as xs:string
#### docx-query.xsl
XSLT stylesheet version 2.0 (1 template)
Runtime parameter ``docx-file-uri`` as xs:string
#### html-to-markdown.xsl
XSLT stylesheet version 3.0 (13 templates)
#### manifest-reorder.xsl
XSLT stylesheet version 3.0 (2 templates)
#### directory-manifest-produce.xpl
XProc pipeline version 1.0 (3 steps)
Runtime dependency: `directory-manifest.xsl`
Runtime dependency: `directory-manifest.xsl`
Runtime dependency: `manifest-reorder.xsl`
Runtime dependency: `html-to-markdown.xsl`
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment