Commit dd344088 authored by Wendell Piez

added initial versions of XSLT

parent 26845a44
<?xml version="1.0" encoding="UTF-8"?>
<p:declare-step version="1.0"
                xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:c="http://www.w3.org/ns/xproc-step">

  <!-- Pipeline: loads word/document.xml out of a .docx (addressed through a jar: URI)
       and runs it through three XSLT passes in sequence. The final result and each
       intermediate result are exposed on separate output ports. -->

  <p:input kind="parameter" port="parameters"/>

  <!-- URI of the .docx file; it is spliced into the jar: URI built below. -->
  <p:option required="true" name="docx-file-uri"/>

  <!-- Final result: output of the last step. -->
  <p:output port="result">
    <p:pipe step="final" port="result"/>
  </p:output>

  <!-- Interim results, one per XSLT pass. -->
  <p:output primary="false" port="interimA">
    <p:pipe step="slops-extracted" port="result"/>
  </p:output>
  <p:output primary="false" port="interimB">
    <p:pipe step="scrubbed" port="result"/>
  </p:output>
  <p:output primary="false" port="interimC">
    <p:pipe step="collapsed" port="result"/>
  </p:output>

  <!-- Every port is serialized indented and without an XML declaration. -->
  <p:serialization omit-xml-declaration="true" indent="true" port="result"/>
  <p:serialization omit-xml-declaration="true" indent="true" port="interimA"/>
  <p:serialization omit-xml-declaration="true" indent="true" port="interimB"/>
  <p:serialization omit-xml-declaration="true" indent="true" port="interimC"/>

  <!-- Address of the main document part inside the .docx archive. -->
  <p:variable name="document-path"
              select="concat('jar:',$docx-file-uri,'!/word/document.xml')"/>

  <!--<p:variable name="document-xml" select="doc($document-path)"/>-->

  <!-- Validate HTML5 results here: http://validator.w3.org/nu/ -->

  <!-- Fetch word/document.xml. -->
  <p:load>
    <p:with-option select="$document-path" name="href"/>
  </p:load>

  <!-- Pass 1: WordprocessingML to HTML, with CSS reflection switched on. -->
  <p:xslt name="slops-extracted">
    <p:input port="stylesheet">
      <p:document href="quickndirty2.xsl"/>
    </p:input>
    <p:with-param select="'yes'" name="show-css"/>
  </p:xslt>

  <!-- Pass 2: scrub.xsl. -->
  <p:xslt name="scrubbed">
    <p:input port="stylesheet">
      <p:document href="scrub.xsl"/>
    </p:input>
  </p:xslt>

  <!-- Pass 3: join-elements.xsl. -->
  <p:xslt name="collapsed">
    <p:input port="stylesheet">
      <p:document href="join-elements.xsl"/>
    </p:input>
  </p:xslt>

  <!-- Named pass-through so the 'result' port has a stable step to read from. -->
  <p:identity name="final"/>

</p:declare-step>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
  xmlns:xs="http://www.w3.org/2001/XMLSchema"
  xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
  exclude-result-prefixes="#all"
  version="2.0">

  <xsl:import href="quickndirty2.xsl"/>

  <!-- A 'shell' stylesheet, permitting us to pass a .docx file as an input parameter,
       using Java to retrieve the document.xml from inside it and process that file
       through imported templates (matching elements in the w: namespace), for
       "extraction" output. -->

  <!-- The full path (URI) to the input docx must be passed at runtime. -->
  <xsl:param as="xs:string" name="docx-file-uri" required="yes"/>

  <!-- Overriding imported binding yes|no -->
  <xsl:param as="xs:string" name="show-css">yes</xsl:param>

  <xsl:output indent="yes" omit-xml-declaration="yes"/>

  <!-- jar: URI of the main document part inside the .docx archive. -->
  <xsl:variable name="document-path" select="concat('jar:',$docx-file-uri,'!/word/document.xml')"/>

  <xsl:variable name="document-xml" select="document($document-path)"/>

  <!-- Overriding the imported bindings of the same names. The imported stylesheet
       resolves 'endnotes.xml' and 'footnotes.xml' against '/', the document holding
       the initial context node. When this shell is used, that is not word/document.xml
       (and there is no such node at all if the transformation starts at the 'extract'
       template), so the note parts would not be found. Resolving against $document-xml
       locates them next to document.xml inside the archive. -->
  <xsl:variable name="endnotes-doc" select="document('endnotes.xml', $document-xml)"/>
  <xsl:variable name="footnotes-doc" select="document('footnotes.xml', $document-xml)"/>

  <!-- Entry point: usable either by matching the root of any source document or by
       being invoked by name ('extract'). -->
  <xsl:template match="/" name="extract">
    <!-- Grabbing the document element of document.xml; imported templates will take over. -->
    <xsl:apply-templates select="$document-xml/*"/>
  </xsl:template>

</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:coko="http://coko.foundation/xslt/wordml/util"
exclude-result-prefixes="#all">
<!-- Identity transform: anything not matched by a more specific template is copied,
     attributes first, then children. -->
<xsl:template match="@* | node()">
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>
<!-- Paragraphs: keep the p element and its attributes, and replace its contents with
     the collapsed form produced by 'collapse-ilk' (called with its default, the
     children of this p). Without the xsl:copy wrapper only the collapsed children
     would be written and the paragraph element itself, with any attributes such as
     class, would disappear from the result. -->
<xsl:template match="p">
  <xsl:copy>
    <xsl:copy-of select="@*"/>
    <xsl:call-template name="collapse-ilk"/>
  </xsl:copy>
</xsl:template>
<!-- collapse-ilk: merges runs of adjacent, like-hashed nodes.
     Parameter $among: the nodes to process; defaults to the children of the context node.
     Nodes in $among are grouped while adjacent nodes share the same coko:node-hash()
     value. A group that starts with an element yields ONE copy of that first element
     (with its attributes) whose content is this same template applied to the children
     of every member of the group. Members of a group that are not elements are copied
     unchanged. -->
<xsl:template name="collapse-ilk">
<xsl:param name="among" select="node()"/>
<xsl:for-each-group select="$among" group-adjacent="coko:node-hash(.)">
<!-- The for-each runs at most once: it selects the first group member only if it is an
     element, and makes it the context node for xsl:copy. -->
<xsl:for-each select="current-group()[1]/self::*">
<!-- In the element case, splice in an element. -->
<xsl:copy>
<xsl:copy-of select="@*"/>
<!-- Recurse over the pooled children of all elements in the group. -->
<xsl:call-template name="collapse-ilk">
<xsl:with-param name="among" select="current-group()/node()"/>
</xsl:call-template>
</xsl:copy>
</xsl:for-each>
<!-- Splice in anything not an element. -->
<xsl:copy-of select="current-group()[empty(self::*)]"/>
</xsl:for-each-group>
</xsl:template>
<!-- coko:node-hash($n): returns a string built by the 'hash' mode templates below.
     Used as the grouping key in 'collapse-ilk'. -->
<xsl:function as="xs:string" name="coko:node-hash">
  <xsl:param as="node()" name="n"/>
  <xsl:value-of separator="|">
    <xsl:apply-templates mode="hash" select="$n"/>
  </xsl:value-of>
</xsl:function>

<!-- Text, comments and processing instructions contribute nothing to the hash. -->
<xsl:template mode="hash" match="text() | comment() | processing-instruction()"/>

<!-- An attribute contributes its local name and value, joined by a colon. -->
<xsl:template mode="hash" match="@*">
  <xsl:value-of select="concat(local-name(), ':', .)"/>
</xsl:template>

<!-- An element contributes its local name followed by its attributes, visited in
     order of their local names. -->
<xsl:template mode="hash" match="*">
  <xsl:value-of select="local-name()"/>
  <xsl:apply-templates select="@*" mode="#current">
    <xsl:sort select="local-name()"/>
  </xsl:apply-templates>
</xsl:template>

<!-- A div hashes to its generated id, so two different div elements never share a hash. -->
<xsl:template mode="hash" match="div">
  <xsl:value-of select="generate-id()"/>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:coko="http://coko.foundation/xslt/wordml/util"
exclude-result-prefixes="#all">
<!-- Identity transform: the default for everything not handled below. -->
<xsl:template match="@* | node()">
  <xsl:copy>
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:copy>
</xsl:template>

<!-- Drop any p whose string value contains no non-whitespace character. -->
<xsl:template match="p[not(matches(string(.), '\S'))]"/>

<!-- Unwrap any span that carries no attribute other than @style: its contents are
     processed, the span itself is not copied. -->
<xsl:template match="span[not(@* except @style)]">
  <xsl:apply-templates select="node()"/>
</xsl:template>

<!-- Remove all style attributes. -->
<xsl:template match="@style"/>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
version="2.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:coko="http://coko.foundation/xslt/wordml/util"
exclude-result-prefixes="#all">
<xsl:output omit-xml-declaration="yes" indent="yes"/>

<!-- $show-css: set to 'yes' to switch on $css-reflect; any other value switches it off.
     When on, the traversal is supplemented with @style markers wherever certain kinds
     of formatting (e.g. font shift indicators, including the spurious font shifts left
     in by word processors) are indicated in the text; it can be very noisy. -->
<xsl:param name="show-css" as="xs:string" select="'yes'"/>

<!-- True exactly when $show-css is 'yes'. -->
<xsl:variable name="css-reflect" as="xs:boolean" select="$show-css eq 'yes'"/>
<!-- Entry point: run on 'document.xml' inside a .docx. Produces the html shell and
     hands over to w:body. -->
<xsl:template match="/w:document">
  <html>
    <head>
      <meta charset="UTF-8"/>
    </head>
    <xsl:apply-templates select="w:body"/>
  </html>
</xsl:template>

<!-- The body: only its w:p children are processed. -->
<xsl:template match="w:body">
  <body>
    <xsl:apply-templates select="w:p"/>
  </body>
</xsl:template>

<!-- A paragraph becomes p; a paragraph style (w:pPr/w:pStyle) is carried over as
     @class. Only the w:r children are processed. -->
<xsl:template match="w:p">
  <p>
    <xsl:for-each select="w:pPr/w:pStyle">
      <xsl:attribute name="class">
        <xsl:value-of select="@w:val"/>
      </xsl:attribute>
    </xsl:for-each>
    <xsl:apply-templates select="w:r"/>
  </p>
</xsl:template>
<!-- coko:css-literal($run): the CSS rendering of the run's w:rPr (via 'render-css'
     mode) when $css-reflect is on; the empty sequence otherwise. -->
<xsl:function as="xs:string?" name="coko:css-literal">
  <xsl:param as="element(w:r)" name="run"/>
  <xsl:apply-templates mode="render-css" select="$run/w:rPr[$css-reflect]"/>
</xsl:function>

<!-- A run with a non-blank CSS rendering is wrapped in a span announcing that CSS. -->
<xsl:template match="w:r[matches(coko:css-literal(.), '\S')]">
  <span>
    <xsl:attribute name="style" select="normalize-space(coko:css-literal(.))"/>
    <xsl:call-template name="format-components"/>
  </span>
</xsl:template>

<!-- Any other run is processed without a wrapper. -->
<xsl:template match="w:r">
  <xsl:call-template name="format-components"/>
</xsl:template>
<!-- format-components: called with a w:r as context. Processes the children of the run
     (other than w:rPr) in adjacent bundles that either are, or are not, to be wrapped
     in the run's formatting. -->
<xsl:template name="format-components">
<xsl:for-each-group select="* except w:rPr" group-adjacent="coko:has-format(.)">
<!-- current-grouping-key() is the value of coko:has-format(): it is always false for
footnote and endnote references, false for every element when the run has no w:rPr,
and true otherwise. The effect of the group-adjacent is to "bundle" elements
to be wrapped in formatting, or not, depending on the element type. For example,
footnote callouts that are expanded to footnotes are not wrapped, lest formatting
for the callout be wrapped around the footnote in the result. -->
<xsl:choose>
<xsl:when test="current-grouping-key()">
<!-- When the bundle is to be formatted, traverse to w:rPr, carrying the processed group
through as the tunnel parameter 'contents'. -->
<xsl:apply-templates select="../w:rPr">
<xsl:with-param name="contents" tunnel="yes">
<xsl:apply-templates select="current-group()"/>
</xsl:with-param>
</xsl:apply-templates>
</xsl:when>
<xsl:otherwise>
<!-- Otherwise process the bundle directly, with no formatting wrapped around it. -->
<xsl:apply-templates select="current-group()"/>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each-group>
</xsl:template>
<!-- The note parts, located by resolving their file names against the base URI of the
     document that holds the initial context node. -->
<xsl:variable name="endnotes-doc" select="document('endnotes.xml',/)"/>
<xsl:variable name="footnotes-doc" select="document('footnotes.xml',/)"/>

<!-- Notes are retrieved by their @w:id. -->
<xsl:key name="endnotes-by-id" use="@w:id" match="w:endnote"/>
<xsl:key name="footnotes-by-id" use="@w:id" match="w:footnote"/>

<!-- An endnote reference is replaced by a div holding the processed endnote that has
     the same @w:id. -->
<xsl:template priority="3" match="w:endnoteReference">
  <xsl:variable name="note" select="key('endnotes-by-id',@w:id,$endnotes-doc)"/>
  <div class="endnote_fetched">
    <xsl:apply-templates select="$note"/>
  </div>
</xsl:template>

<!-- A footnote reference is handled the same way, against the footnotes part. -->
<xsl:template priority="3" match="w:footnoteReference">
  <xsl:variable name="note" select="key('footnotes-by-id',@w:id,$footnotes-doc)"/>
  <div class="footnote_fetched">
    <xsl:apply-templates select="$note"/>
  </div>
</xsl:template>
<!-- w:rPr works by pushing its contents (the tunnel parameter 'contents') through its
     children one at a time in sibling succession, giving each of them an opportunity
     to wrap the results. Individual templates matching w:rPr/* provide the particular
     mappings into HTML. -->
<xsl:template match="w:rPr">
  <xsl:param name="contents" tunnel="yes" select="()"/>
  <xsl:choose>
    <xsl:when test="exists(*)">
      <!-- Start the sibling traversal; 'contents' travels along as a tunnel parameter. -->
      <xsl:apply-templates select="*[1]"/>
    </xsl:when>
    <xsl:otherwise>
      <!-- No formatting properties at all: emit the contents unwrapped. -->
      <xsl:sequence select="$contents"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- No modes are needed here: the children of w:rPr perform a *sibling traversal*,
     each one wrapping whatever its following siblings produce in an HTML (inline)
     wrapper. The specific templates carry an explicit @priority so that they win
     over the default below. -->

<!-- Default: the wrapper is named after the property's tag in Word ML (w: namespace). -->
<xsl:template match="w:rPr/*">
  <xsl:element name="{local-name()}">
    <xsl:call-template name="tuck-next"/>
  </xsl:element>
</xsl:template>

<!-- Bold, complex script.
     https://msdn.microsoft.com/en-us/library/documentformat.openxml.wordprocessing.boldcomplexscript(v=office.14).aspx -->
<xsl:template match="w:rPr/w:bCs" priority="5">
  <b class="bCs">
    <xsl:call-template name="tuck-next"/>
  </b>
</xsl:template>

<!-- When there's an inline (character) style, announce it as the class of a span. -->
<xsl:template match="w:rPr/w:rStyle" priority="5">
  <span>
    <xsl:attribute name="class" select="@w:val"/>
    <xsl:call-template name="tuck-next"/>
  </span>
</xsl:template>
<!-- This should match any formatting we don't wish to see among wrapped inline elements;
     note that the same formatting properties may be detected in/by CSS reflection instead.
     The colour property is matched as w:rPr/w:color: like the others it is a child of
     w:rPr, and matching it under any other parent name would let it fall through to the
     default w:rPr/* template and show up as a 'color' wrapper element in the result. -->
<xsl:template priority="5" match="w:rPr/w:sz | w:rPr/w:szCs | w:rPr/w:rFonts | w:rPr/w:color">
  <!-- No wrapper: just do the next one. -->
  <xsl:call-template name="tuck-next"/>
</xsl:template>
<!-- tuck-next: called to effect the sibling traversal among w:rPr/* elements.
     With a following sibling element, processing moves on to it; without one, the
     traversal is over and the tunnelled 'contents' are emitted. -->
<xsl:template name="tuck-next">
  <xsl:param name="contents" tunnel="yes" select="()"/>
  <xsl:variable name="next" select="following-sibling::*[1]"/>
  <xsl:choose>
    <xsl:when test="exists($next)">
      <!-- There's more format: keep going. -->
      <xsl:apply-templates select="$next"/>
    </xsl:when>
    <xsl:otherwise>
      <!-- Nothing further: go back and get the text. -->
      <xsl:sequence select="$contents"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- coko:has-format($n): true when $n's parent has a w:rPr child and $n is not a
     callout, as decided by the 'is-callout' mode templates below. -->
<xsl:function as="xs:boolean" name="coko:has-format">
  <xsl:param as="node()" name="n"/>
  <xsl:variable as="xs:boolean" name="callout">
    <xsl:apply-templates mode="is-callout" select="$n"/>
  </xsl:variable>
  <xsl:sequence select="not($callout) and exists($n/../w:rPr)"/>
</xsl:function>

<!-- By default an element is not a callout. -->
<xsl:template mode="is-callout" as="xs:boolean" match="*">
  <xsl:sequence select="false()"/>
</xsl:template>

<!-- Note references are callouts, since we don't want to see these wrapped in formatting. -->
<xsl:template mode="is-callout" as="xs:boolean" match="w:footnoteReference | w:endnoteReference">
  <xsl:sequence select="true()"/>
</xsl:template>
<!-- 'render-css' mode. By default an element contributes nothing. -->
<xsl:template mode="render-css" match="*"/>

<!-- w:rPr: the results for its children are joined with '; ' into one string. -->
<xsl:template match="w:rPr" mode="render-css">
  <xsl:value-of separator="; ">
    <xsl:apply-templates select="node()" mode="render-css"/>
  </xsl:value-of>
</xsl:template>
<!-- w:rFonts: a font-family declaration taken from @w:ascii (the value is left blank
     when that attribute is absent). -->
<xsl:template match="w:rFonts" as="xs:string" mode="render-css">
  <xsl:sequence select="concat('font-family: ', @w:ascii)"/>
</xsl:template>
<!-- w:sz | w:szCs: a font-size declaration. WordprocessingML gives the size in
     half-points, so a numeric @w:val is halved and written with the 'pt' unit
     (24 becomes 'font-size: 12pt'); a bare number would not be a valid CSS length.
     A @w:val that is missing or not numeric is written as found, without a unit. -->
<xsl:template mode="render-css" as="xs:string" match="w:sz | w:szCs">
  <xsl:variable name="half-points" select="string(@w:val)"/>
  <xsl:sequence select="concat('font-size: ',
                               if ($half-points castable as xs:decimal)
                               then concat(xs:decimal($half-points) div 2, 'pt')
                               else $half-points)"/>
</xsl:template>
<!-- w:color: a color declaration. A six-digit hexadecimal @w:val is written with a
     leading '#', whether it starts with a digit or a letter (FF0000 becomes
     'color: #FF0000'). Any other value, such as 'auto', is written as found. -->
<xsl:template mode="render-css" as="xs:string" match="w:color">
  <xsl:variable name="val" select="string(@w:val)"/>
  <xsl:sequence select="concat('color: ',
                               if (matches($val, '^[0-9A-Fa-f]{6}$'))
                               then concat('#', $val)
                               else $val)"/>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment