Commit b86821af authored by Wendell Piez's avatar Wendell Piez
Browse files

Added quote-mark handling logic (whee) to Editoria tune filter

parent 2ccc4d02
......@@ -16,19 +16,78 @@
<!-- XSweet: provides a "bridge filter" for final tuning of HTML contents; a generalized sub-editorial preprocessor supporting string replacement. [3b] -->
<!-- Input: HTML -->
<!-- Output: A copy of the input, with text munges and other "tuning" -->
<!-- Limitation: doesn't handle overlap; any inline markup splitting strings will also inhibit string replacement. -->
<!-- Limitation: also, doesn't discriminate between ws that is "safe to munge" (eg paragraph content) and "significant" ws (eg code blocks or ASCII art): this will wipe it all. -->
<!-- Limitation: Doesn't discriminate between ws that is "safe to munge" (eg paragraph content) and "significant" ws (eg code blocks or ASCII art): this will treat all text indiscriminately. -->
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- Serialize the result as XML, with indentation requested and without an XML declaration. -->
<xsl:output method="xml" indent="yes" omit-xml-declaration="yes"/>
<!-- Nodes that no template matches are shallow-copied and their children processed, so the input passes through unchanged except where a template below intervenes. -->
<xsl:mode on-no-match="shallow-copy"/>
<!-- Available xsw: element semantics
xsw:match - matches a substring using a regular expression (the element's text content). The replacement string is given on @replace
and may contain replacement expressions such as $1; when @replace is absent, the matched substring is deleted.
xsw:match with @when-after or @when-next - qualifies the match so it only happens when the text preceding the current
text node (@when-after) or following it (@when-next), in the same paragraph, also matches the regular expression given on that attribute.
This helps matching across element boundaries.
With @when-after the match is anchored to the start of the current text node; with @when-next it is anchored to its end.
Note that in this case the replacement doesn't touch the substring matched in the preceding or following text.
xsw:match-first and xsw:match-last
Like xsw:match and working similarly, but only applied to the first (leading) or last (trailing) text node of a paragraph.
xsw:message - emits an XSLT runtime message (for debugging) and leaves the text unchanged.
-->
<!-- The replacements will be made in order so earlier matches take precedence. -->
<xsl:variable name="replacements">
<xsw:match replace="&#x2026;" >\.\.\.</xsw:match> <!-- replace three dots with horizontal ellipsis -->
<xsw:match replace="&#x2013;$1">-(\d)</xsw:match> <!-- replace hyphen+digit with en dash + digit -->
<xsw:match replace="&#x20;" >\s+</xsw:match> <!-- replace runs of spaces with single space -->
<xsw:match replace="&#x2014;" >\s*&#x2014;\s*</xsw:match><!-- omit whitespace around em dashes -->
<!-- $space: a regular-expression character class matching a plain space (U+0020) or a line feed (U+000A); tab is not included. -->
<xsl:variable name="space" as="xs:string" select="'[&#x20;&#xA;]'"/>
<!-- Ordered rule table for text tuning. The rules are applied top to bottom, each one to the output of the previous one, so their order matters. -->
<!-- expand-text is on: {$space} inside a rule is replaced by the $space character class before the text is used as a regular expression. -->
<xsl:variable name="replacements" expand-text="yes" as="element(xsw:sequence)">
<xsw:sequence>
<!-- Two adjacent hyphens become an em dash: "-\-" -> "—"-->
<xsw:match replace="&#x2014;">\-\-</xsw:match>
<!-- An en dash surrounded on both sides by whitespace is converted to an em dash: " – " -> "—"-->
<!-- nb the padding whitespace is removed along with the en dash -->
<xsw:match replace="&#x2014;">\s+&#x2013;\s+</xsw:match>
<!-- Spaces and line feeds touching a run of tabs are removed, and the run itself collapses to one tab -->
<!-- (Runs of tabs might remain where they were mixed with spaces; the next rule cleans those up.) -->
<xsw:match replace="&#x9;">{$space}*&#x9;+{$space}*</xsw:match>
<!-- Replace runs of multiple consecutive tabs with just one tab-->
<xsw:match replace="&#x9;">&#x9;+</xsw:match>
<!-- Equal signs should be surrounded on either side by one and only one space: " = "-->
<!-- (First every equal sign is padded, then any whitespace around a padded sign is reduced to single spaces.)-->
<xsw:match replace=" = ">=</xsw:match>
<xsw:match replace=" = ">\s+=\s+</xsw:match>
<!-- Remove spaces and line feeds at the very beginning of ps-->
<!-- 'match-first' is a no-op except in the first text node of a p; having no @replace, it deletes what it matches -->
<xsw:match-first>^{$space}+</xsw:match-first>
<!-- (Doesn't strip tabs, because $space does not include tab) -->
<!-- Remove all whitespace that ends a paragraph-->
<!-- 'match-last' is a no-op except in the last text node of a p -->
<xsw:match-last>\s+$</xsw:match-last>
<!-- (Removes all trailing whitespace, including tabs) -->
<!-- replace three dots with horizontal ellipsis -->
<xsw:match replace="&#x2026;">\.\.\.</xsw:match>
<!-- replace hyphen+digit with en dash + digit -->
<xsw:match replace="&#x2013;$1">-(\d)</xsw:match>
<!-- Replace every run of spaces and line feeds with just one space (a lone line feed also becomes a space)-->
<xsw:match replace="&#x20;">{$space}+</xsw:match>
<!-- omit whitespace around em dashes -->
<xsw:match replace="&#x2014;">\s*&#x2014;\s*</xsw:match>
<!-- subsequence to perform all quotation mark munging; runs last, on the output of all the rules above -->
<xsw:munge-quotes/>
</xsw:sequence>
</xsl:variable>
<xsl:template match="h4 | h5 | h6">
......@@ -38,15 +97,210 @@
</h3>
</xsl:template>
<xsl:template match="text()">
<xsl:variable name="str" select="string(.)"/>
<xsl:iterate select="$replacements/*">
<xsl:param name="str" select="$str" as="xs:string"/>
<!-- Entry point for text tuning: each text node inside body is passed through the $replacements rule sequence,
     and the string that comes back takes the node's place in the output. -->
<xsl:template match="body//text()">
  <xsl:call-template name="call-sequence">
    <xsl:with-param name="sequence" select="$replacements"/>
    <xsl:with-param name="original" select="."/>
  </xsl:call-template>
</xsl:template>
<!-- Runs every rule (child element) of an xsw:sequence, in document order, over a single string.
     Parameters:
       original - the source text node; handed to each rule so it can inspect the surrounding paragraph
       starting - the string to start from; defaults to the string value of $original
       sequence - the xsw:sequence element whose children are the rules
     Returns the string produced by the last rule; when the sequence has no children, $starting comes back unchanged. -->
<xsl:template name="call-sequence" as="xs:string">
  <xsl:param name="original" as="text()"/>
  <xsl:param name="starting" as="xs:string" select="string($original)"/>
  <xsl:param name="sequence" as="element(xsw:sequence)"/>
  <xsl:iterate select="$sequence/*">
    <!-- Only the working string changes between iterations; $original is read from the enclosing template. -->
    <xsl:param name="str" select="$starting" as="xs:string"/>
    <xsl:on-completion select="$str"/>
    <xsl:next-iteration>
      <!-- Typed as xs:string so the rule's result is passed on directly instead of being wrapped in a temporary tree. -->
      <xsl:with-param name="str" as="xs:string">
        <xsl:apply-templates select=".">
          <xsl:with-param name="original" select="$original"/>
          <xsl:with-param name="str" select="$str"/>
        </xsl:apply-templates>
      </xsl:with-param>
    </xsl:next-iteration>
  </xsl:iterate>
</xsl:template>
<!-- Plain rule: replaces every match of the element's text (a regular expression) in $str.
     The replacement comes from @replace; without that attribute the matches are deleted. -->
<xsl:template match="xsw:match">
  <xsl:param name="str" required="yes" as="xs:string"/>
  <xsl:variable name="pattern" as="xs:string" select="string(.)"/>
  <xsl:variable name="substitute" as="xs:string" select="(@replace, '')[1]"/>
  <xsl:sequence select="replace($str, $pattern, $substitute)"/>
</xsl:template>
<!-- Rule qualified by @when-next: the pattern is anchored to the END of $str and is only applied when the
     text that follows $original inside the same p starts with a match for @when-next.
     Otherwise $str is returned untouched. -->
<xsl:template match="xsw:match[exists(@when-next)]">
  <xsl:param name="original" required="yes" as="text()"/>
  <xsl:param name="str" required="yes" as="xs:string"/>
  <!-- Concatenation of all text nodes of the nearest p ancestor that come after $original. -->
  <xsl:variable name="following-text"
                select="string-join($original/ancestor::p[1]/descendant::text()[. &gt;&gt; $original], '')"/>
  <xsl:variable name="applies" select="matches($following-text, '^' || @when-next)"/>
  <xsl:sequence select="if ($applies) then replace($str, . || '$', (@replace, '')[1]) else $str"/>
</xsl:template>
<!-- Rule qualified by @when-after: the pattern is anchored to the START of $str and is only applied when the
     text that precedes $original inside the same p ends with a match for @when-after.
     Otherwise $str is returned untouched. -->
<xsl:template match="xsw:match[exists(@when-after)]">
  <xsl:param name="original" required="yes" as="text()"/>
  <xsl:param name="str" required="yes" as="xs:string"/>
  <!-- Concatenation of all text nodes of the nearest p ancestor that come before $original. -->
  <xsl:variable name="preceding-text"
                select="string-join($original/ancestor::p[1]/descendant::text()[. &lt;&lt; $original], '')"/>
  <xsl:variable name="applies" select="matches($preceding-text, @when-after || '$')"/>
  <xsl:sequence select="if ($applies) then replace($str, '^' || ., (@replace, '')[1]) else $str"/>
</xsl:template>
<!-- Rule carrying BOTH @when-after and @when-next. Priority 5 makes this template win over the two
     single-attribute templates, which would otherwise both match.
     The pattern is anchored at both ends, so it has to account for the whole of $str, and it is applied only when
       - the text preceding $original in the same p ends with a match for @when-after, AND
       - the text following $original in the same p starts with a match for @when-next.
     Otherwise $str is returned untouched.
     Fix: the condition used to test the preceding text twice and never looked at the following text. -->
<xsl:template priority="5" match="xsw:match[exists(@when-next)][exists(@when-after)]">
  <xsl:param name="original" required="yes" as="text()"/>
  <xsl:param name="str" required="yes" as="xs:string"/>
  <xsl:variable name="regex" select="'^' || . || '$'"/>
  <!-- All text nodes of the nearest p ancestor, split below into what comes before and after $original. -->
  <xsl:variable name="paragraph-text" select="$original/ancestor::p[1]/descendant::text()"/>
  <xsl:variable name="preceding-text" select="string-join($paragraph-text[. &lt;&lt; $original], '')"/>
  <xsl:variable name="following-text" select="string-join($paragraph-text[. &gt;&gt; $original], '')"/>
  <xsl:choose>
    <xsl:when test="matches($preceding-text, @when-after || '$') and matches($following-text, '^' || @when-next)">
      <xsl:sequence select="replace($str, $regex, (@replace, '')[1])"/>
    </xsl:when>
    <xsl:otherwise>
      <xsl:sequence select="$str"/>
    </xsl:otherwise>
  </xsl:choose>
</xsl:template>
<!-- Like xsw:match, but acts only when $original is the first text node inside its nearest p ancestor.
     For any other text node, including one with no p ancestor at all, $str is returned unchanged. -->
<xsl:template match="xsw:match-first">
  <xsl:param name="original" required="yes" as="text()"/>
  <xsl:param name="str" required="yes" as="xs:string"/>
  <xsl:variable name="leading-text" select="$original/ancestor::p[1]/descendant::text()[1]"/>
  <xsl:sequence select="if ($original is $leading-text) then replace($str, string(.), (@replace, '')[1]) else $str"/>
</xsl:template>
<!-- Like xsw:match, but acts only when $original is the last text node inside its nearest p ancestor.
     For any other text node, including one with no p ancestor at all, $str is returned unchanged. -->
<xsl:template match="xsw:match-last">
  <xsl:param name="original" required="yes" as="text()"/>
  <xsl:param name="str" required="yes" as="xs:string"/>
  <xsl:variable name="trailing-text" select="$original/ancestor::p[1]/descendant::text()[last()]"/>
  <xsl:sequence select="if ($original is $trailing-text) then replace($str, string(.), (@replace, '')[1]) else $str"/>
</xsl:template>
<!-- String constants: the straight double quote (U+0022) and the straight apostrophe (U+0027). -->
<xsl:variable name="quot" as="xs:string" select="'&quot;'"/>
<xsl:variable name="apos" as="xs:string" select="&quot;'&quot;"/>
<!-- Quotation-mark munging, run as a nested rule sequence through call-sequence.
     Parameters:
       original - the source text node (passed on so the @when-next / @when-after rules can see the paragraph)
       str      - the working string as left by the preceding rules
     Returns $str with quotation marks first flattened to straight ones and then re-curled heuristically. -->
<xsl:template match="xsw:munge-quotes">
  <xsl:param name="original" required="yes" as="text()"/>
  <xsl:param name="str" required="yes" as="xs:string"/>
  <xsl:call-template name="call-sequence">
    <xsl:with-param name="starting" select="$str"/>
    <xsl:with-param name="original" select="$original"/>
    <xsl:with-param name="sequence" as="element(xsw:sequence)" expand-text="true">
      <xsw:sequence>
        <!-- Alex's sequence - first, reduce all left- and right-facing quotation marks to their 'straight' analog
             u2018 and u2019 -> u0027
             u201c and u201d -> u0022
             (The spec also mentions mapping backticks; no rule here does that.)
        -->
        <xsw:match replace="{$apos}">[&#x2018;&#x2019;]</xsw:match>
        <xsw:match replace="{$quot}">[&#x201c;&#x201d;]</xsw:match>
        <!--
             Then apply heuristics to map back again. "Non-space" below is \S, so it covers punctuation as well as letters.
             quotation mark + non-space (u0022+x) -> left double quotation mark + x (u201c+x)
             quotation mark ending this text node, when the following text starts with a non-space -> u201c
             quotation mark starting this text node, when the preceding text ends with a non-space -> u201d
             non-space + straight or left double quotation mark -> x + right double quotation mark (x+u201d)
             apostrophe + non-space (u0027+x) -> left single quotation mark + x (u2018+x)
             non-space + straight or left single quotation mark -> x + right single quotation mark (x+u2019)
             NOTE(review): because any non-space counts, a quote directly after an opening bracket or a dash
             is first opened and then closed again by the "non-space + quote" rules.
        -->
        <xsw:match replace="&#x201c;$1">"(\S)</xsw:match>
        <xsw:match when-next="\S" replace="&#x201c;">"</xsw:match>
        <xsw:match when-after="\S" replace="&#x201d;">"</xsw:match>
        <xsw:match replace="$1&#x201d;">(\S)["&#x201c;]</xsw:match>
        <xsw:match replace="&#x2018;$1">'(\S)</xsw:match>
        <xsw:match replace="$1&#x2019;">(\S)['&#x2018;]</xsw:match>
        <!-- brute s/r (from spec)
             " 'em" or " ‘em" (space+u0027+"em" or space+u2018+"em") -> " ’em" (space+u2019+"em")
             "'n'" or "‘n‘" (u0027+"n"+u0027 or u2018+"n"+u2018) -> "’n’" (u2019+"n"+u2019)
             " 'tis" (space+u0027+"tis" or space+u2018+"tis") -> " ’tis" (space+u2019+"tis")
             The em and tis rules require a non-word character or the end of the string after the word, so that
             quoted words which merely begin with those letters ('email', 'tissue') keep their opening quote.
             The spec's two em dash rules (u2014+u201d -> u2014+u201c, u201c+u2014 -> u201d+u2014) are not implemented here.
        -->
        <xsw:match replace="&#x2019;em$1">['&#x2018;]em(\W|$)</xsw:match>
        <xsw:match replace="&#x2019;n&#x2019;">['&#x2018;&#x2019;]n['&#x2018;&#x2019;]</xsw:match>
        <xsw:match replace="&#x2019;tis$1">['&#x2018;]tis(\W|$)</xsw:match>
      </xsw:sequence>
    </xsl:with-param>
  </xsl:call-template>
</xsl:template>
<!-- Fallback for any xsw: element that has no more specific template: the working string passes through unchanged. -->
<xsl:template match="xsw:*">
<xsl:param name="str" required="yes" as="xs:string"/>
<xsl:sequence select="$str"/>
</xsl:template>
<!-- Debugging aid: emits the element's text as an XSLT runtime message, then hands the working string back unchanged. -->
<xsl:template match="xsw:message">
  <xsl:param name="str" required="yes" as="xs:string"/>
  <xsl:message select="string(.)"/>
  <xsl:sequence select="$str"/>
</xsl:template>
</xsl:stylesheet>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment