Skip to content

Commit 22f5994

Browse files
committed
feat: check "data-*" attributes name restrictions
According to HTML:

> A custom data attribute is an attribute in no namespace whose name starts with the string "data-", has at least one character after the hyphen, is XML-compatible, and contains no ASCII upper alphas.

This commit reports invalid `data-*` attributes as errors with a new error code, `HTM-061`. Ideally, this check should be delegated to validator.nu, but we add it here until the HTML checker is fully integrated. Close #1107
1 parent 6a3d2a6 commit 22f5994

File tree

7 files changed

+47
-1
lines changed

7 files changed

+47
-1
lines changed

src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ private void initialize()
145145
severities.put(MessageId.HTM_059, Severity.ERROR);
146146
severities.put(MessageId.HTM_060a, Severity.USAGE);
147147
severities.put(MessageId.HTM_060b, Severity.USAGE);
148+
severities.put(MessageId.HTM_061, Severity.ERROR);
148149

149150
// Media
150151
severities.put(MessageId.MED_001, Severity.SUPPRESSED);

src/main/java/com/adobe/epubcheck/messages/MessageId.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ public enum MessageId implements Comparable<MessageId>
139139
HTM_059("HTM_059"),
140140
HTM_060a("HTM_060a"),
141141
HTM_060b("HTM_060b"),
142+
HTM_061("HTM_061"),
142143

143144
// Messages associated with media (images, audio and video)
144145
MED_001("MED-001"),

src/main/java/com/adobe/epubcheck/xml/HTMLUtils.java

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import com.google.common.base.Preconditions;
66
import com.google.common.collect.ImmutableSet;
77

8+
import net.sf.saxon.om.NameChecker;
9+
810
/**
911
* Utilities for HTML-specific logic.
1012
*
@@ -44,7 +46,7 @@ public static boolean isCustomNamespace(String namespace)
4446
* attributes.
4547
*
4648
* @param name
47-
* the name of an attribute defined in the HTML specification
49+
* the name of an attribute defined in the HTML specification
4850
* @return <code>true</code> iff the attribute value is case-insensitive
4951
*/
5052
public static boolean isCaseInsensitiveAttribute(String namespace, String name)
@@ -57,6 +59,25 @@ public static boolean isDataAttribute(String namespace, String name)
5759
return namespace.isEmpty() && name.startsWith("data-");
5860
}
5961

62+
/**
63+
* Tells if a string is a valid <a href=
64+
* "https://html.spec.whatwg.org/multipage/dom.html#custom-data-attribute">
65+
* custom data attribute</a> name, as
66+
* defined in HTML.
67+
*
68+
* @param name
69+
* the attribute name to test; must be non-null and start with {@code "data-"}, otherwise the precondition check fails
70+
* @return true if the part of {@code name} after {@code "data-"} is non-empty, is accepted by {@code NameChecker.isValidNCName}, and contains no ASCII uppercase letter (A-Z)
71+
*/
72+
public static boolean isValidDataAttribute(String name)
73+
{
74+
Preconditions.checkArgument(name != null && name.startsWith("data-"));
75+
name = name.substring(5); // strip the "data-" prefix; only the suffix is checked below
76+
return !name.isEmpty()
77+
&& NameChecker.isValidNCName(name) // NOTE(review): validates the suffix alone, not the full attribute name; confirm this matches HTML's "XML-compatible" requirement
78+
&& !name.matches(".*[A-Z].*"); // reject any ASCII uppercase letter
79+
}
80+
6081
private HTMLUtils()
6182
{
6283
// Not instantiable.

src/main/java/com/adobe/epubcheck/xml/handlers/PreprocessingDefaultHandler.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,11 @@ private Attributes preprocessAttributes(String elemNamespace, Attributes atts)
8686
// Remove data-* attributes in both XHTML and SVG
8787
if (HTMLUtils.isDataAttribute(namespace, name))
8888
{
89+
if (!HTMLUtils.isValidDataAttribute(name))
90+
{
91+
context.report.message(MessageId.HTM_061, LocationHandler.location(context, locator),
92+
name);
93+
}
8994
attributes.removeAttribute(i);
9095
}
9196
// Remove custom namespace attributes in XHTML

src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ HTM_058=HTML documents must be encoded in UTF-8, but UTF-16 was detected.
6464
HTM_059=Viewport "%1$s" property must not be defined more than once, but found values [%2$s].
6565
HTM_060a=EPUB reading systems must ignore secondary viewport meta elements in fixed-layout documents; viewport declaration "%1$s" will be ignored.
6666
HTM_060b=EPUB reading systems must ignore viewport meta elements in reflowable documents; viewport declaration "%1$s" will be ignored.
67+
HTM_061="%1$s" is not a valid custom data attribute (it must have at least one character after the hyphen, be XML-compatible, and not contain ASCII uppercase letters).
6768

6869
#media
6970
MED_003=Picture "img" elements must reference core media type resources, but found resource "%1$s" of type "%2$s".

src/test/resources/epub3/06-content-document/content-document-xhtml.feature

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,11 @@ Feature: EPUB 3 — Content Documents — XHTML
227227
When checking document 'data-attr-valid.xhtml'
228228
Then no errors or warnings are reported
229229

230+
Scenario: Report invalid `data-*` attributes
231+
When checking document 'data-attr-invalid-error.xhtml'
232+
Then error HTM-061 is reported 3 times
233+
And no other errors or warnings are reported
234+
230235
Scenario: Report invalid elements after a `data-*` attribute
231236
See issue 189 - was allowed by stripping of `data-*` attributes
232237
When checking EPUB 'content-xhtml-data-attr-removal-markup-error'
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<!DOCTYPE html>
2+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
3+
<head>
4+
<meta charset="utf-8" />
5+
<title>data-* attributes</title>
6+
</head>
7+
<body>
8+
<div data-="">invalid (no character after the hyphen)</div>
9+
<div data--test="">invalid (not an XML name)</div>
10+
<div data-ERR="">invalid (contains upper alphas)</div>
11+
</body>
12+
</html>

0 commit comments

Comments
 (0)