Skip to content

Commit 0d6f927

Browse files
committed
feat: update the reporting of file encoding issues
This commit changes the errors and warnings reported when EPUBCheck detects an invalid or non-recommended file encoding. The following error codes are now reported: - `RSC-027` (new): warning reported for XML documents encoded in UTF-16 - `RSC-028` (new): error reported for XML documents in an invalid encoding - `HTM-058` (new): error reported for XHTML encoded in UTF-16 (the HTML standard has an authoring requirement for UTF-8) - `CSS-003` (updated): warning reported for CSS encoded in UTF-16 - `CSS-004` (updated): error reported for CSS in an invalid encoding Note: previously both `CSS-003` and `CSS-004` were errors, reported for a disallowed encoding. The only difference is that one was reported when the encoding was detected from a BOM, the other from a `@charset` declaration. This commit repurposes `CSS-003` as the warning raised for UTF-16, and `CSS-004` as the disallowed-encoding error. Fixes #1245
1 parent 111e772 commit 0d6f927

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+318
-27
lines changed

src/main/java/com/adobe/epubcheck/css/CSSChecker.java

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,17 +138,26 @@ CssSource getCssSource()
138138
if (source.getInputStream().getBomCharset().isPresent())
139139
{
140140
charset = source.getInputStream().getBomCharset().get().toLowerCase(Locale.ROOT);
141-
if (!charset.equals("utf-8") && !charset.startsWith("utf-16"))
141+
if (!charset.equals("utf-8"))
142142
{
143-
report.message(MessageId.CSS_004, EPUBLocation.of(context), charset);
143+
if (charset.startsWith("utf-16"))
144+
{
145+
report.message(MessageId.CSS_003, EPUBLocation.of(context), charset);
146+
} else {
147+
report.message(MessageId.CSS_004, EPUBLocation.of(context), charset);
148+
}
144149
}
145-
}
146-
if (source.getInputStream().getCssCharset().isPresent())
150+
} else if (source.getInputStream().getCssCharset().isPresent())
147151
{
148152
charset = source.getInputStream().getCssCharset().get().toLowerCase(Locale.ROOT);
149-
if (!charset.equals("utf-8") && !charset.startsWith("utf-16"))
153+
if (!charset.equals("utf-8"))
150154
{
151-
report.message(MessageId.CSS_003, EPUBLocation.of(context), charset);
155+
if (charset.startsWith("utf-16"))
156+
{
157+
report.message(MessageId.CSS_003, EPUBLocation.of(context), charset);
158+
} else {
159+
report.message(MessageId.CSS_004, EPUBLocation.of(context), charset);
160+
}
152161
}
153162
}
154163
}

src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ private void initialize()
6969
// CSS
7070
severities.put(MessageId.CSS_001, Severity.ERROR);
7171
severities.put(MessageId.CSS_002, Severity.ERROR);
72-
severities.put(MessageId.CSS_003, Severity.ERROR);
72+
severities.put(MessageId.CSS_003, Severity.WARNING);
7373
severities.put(MessageId.CSS_004, Severity.ERROR);
7474
severities.put(MessageId.CSS_005, Severity.ERROR);
7575
severities.put(MessageId.CSS_006, Severity.USAGE);
@@ -141,6 +141,7 @@ private void initialize()
141141
severities.put(MessageId.HTM_055, Severity.WARNING);
142142
severities.put(MessageId.HTM_056, Severity.ERROR);
143143
severities.put(MessageId.HTM_057, Severity.ERROR);
144+
severities.put(MessageId.HTM_058, Severity.ERROR);
144145

145146
// Media
146147
severities.put(MessageId.MED_001, Severity.ERROR);
@@ -337,6 +338,8 @@ private void initialize()
337338
severities.put(MessageId.RSC_024, Severity.USAGE);
338339
severities.put(MessageId.RSC_025, Severity.USAGE);
339340
severities.put(MessageId.RSC_026, Severity.ERROR);
341+
severities.put(MessageId.RSC_027, Severity.WARNING);
342+
severities.put(MessageId.RSC_028, Severity.ERROR);
340343

341344
// Scripting
342345
severities.put(MessageId.SCP_001, Severity.SUPPRESSED); // checking scripts is out of scope

src/main/java/com/adobe/epubcheck/messages/MessageId.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ public enum MessageId implements Comparable<MessageId>
135135
HTM_055("HTM_055"),
136136
HTM_056("HTM_056"),
137137
HTM_057("HTM_057"),
138+
HTM_058("HTM_058"),
138139

139140
// Messages associated with media (images, audio and video)
140141
MED_001("MED-001"),
@@ -331,6 +332,8 @@ public enum MessageId implements Comparable<MessageId>
331332
RSC_024("RSC-024"),
332333
RSC_025("RSC-025"),
333334
RSC_026("RSC-026"),
335+
RSC_027("RSC-027"),
336+
RSC_028("RSC-028"),
334337

335338
// Messages relating to scripting
336339
SCP_001("SCP-001"),

src/main/java/com/adobe/epubcheck/xml/EncodingSniffer.java renamed to src/main/java/com/adobe/epubcheck/xml/XMLEncodingSniffer.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import java.io.InputStream;
55
import java.util.Locale;
66

7-
public final class EncodingSniffer
7+
public final class XMLEncodingSniffer
88
{
99

1010
private static final byte[][] UTF16_MAGIC = { { (byte) 0xFE, (byte) 0xFF },
@@ -108,7 +108,7 @@ public static String sniffEncoding(InputStream in)
108108
return encoding.toUpperCase(Locale.ROOT);
109109
}
110110

111-
private EncodingSniffer()
111+
private XMLEncodingSniffer()
112112
{
113113
// Not instantiable.
114114
}

src/main/java/com/adobe/epubcheck/xml/XMLParser.java

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import javax.xml.parsers.SAXParser;
3030
import javax.xml.parsers.SAXParserFactory;
3131

32+
import org.w3c.epubcheck.constants.MIMEType;
3233
import org.xml.sax.ContentHandler;
3334
import org.xml.sax.ErrorHandler;
3435
import org.xml.sax.InputSource;
@@ -135,13 +136,33 @@ public void process()
135136
}
136137

137138
// Check encoding
138-
String encoding = EncodingSniffer.sniffEncoding(buffered);
139-
if (encoding != null && !encoding.equals("UTF-8") && !encoding.equals("UTF-16"))
139+
// If the result is null, the XML parser must parse it as UTF-8
140+
String encoding = XMLEncodingSniffer.sniffEncoding(buffered);
141+
if (encoding != null && !encoding.equals("UTF-8"))
140142
{
141-
report.message(MessageId.CSS_003, EPUBLocation.of(context), encoding);
143+
if (encoding.equals("UTF-16"))
144+
{
145+
// XHTML requires UTF-8, UTF-16 is reported as an error
146+
if (MIMEType.XHTML.is(context.mimeType))
147+
{
148+
report.message(MessageId.HTM_058, EPUBLocation.of(context));
149+
}
150+
// For other XML types, UTF-16 is reported as a warning
151+
else
152+
{
153+
report.message(MessageId.RSC_027, EPUBLocation.of(context));
154+
}
155+
}
156+
else
157+
{
158+
report.message(MessageId.RSC_028, EPUBLocation.of(context), encoding);
159+
}
142160
}
143161

144162
// Build the input source
163+
// We do not set the source encoding name, but instead let the SAXParser
164+
// apply its own encoding-sniffing logic, as it can report useful errors
165+
// (for instance a mismatch between a BOM and the XML declaration)
145166
InputSource source = new InputSource(buffered);
146167
source.setSystemId(url.toString());
147168

@@ -163,7 +184,8 @@ public void process()
163184
} catch (SAXException e)
164185
{
165186
// All errors should have already been reported by the error handler
166-
if (report.getFatalErrorCount() == 0) {
187+
if (report.getFatalErrorCount() == 0)
188+
{
167189
report.message(MessageId.RSC_016, EPUBLocation.of(context), e.getMessage());
168190
}
169191
}

src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ CHK_008=Error encountered while processing an item "%1$s"; skip other checks for
4040
#CSS
4141
CSS_001=The "%1$s" property must not be included in an EPUB Style Sheet.
4242
CSS_002=Empty or NULL reference found.
43-
CSS_003=Only UTF-8 and UTF-16 encodings are allowed, detected %1$s.
44-
CSS_004=Only UTF-8 and UTF-16 encodings are allowed, detected %1$s BOM.
43+
CSS_003=CSS document is encoded in UTF-16. It should be encoded in UTF-8 instead.
44+
CSS_004=CSS documents must be encoded in UTF-8, detected %1$s.
4545
CSS_005=Conflicting alternate style attributes found: %1$s.
4646
CSS_006=CSS selector specifies fixed position.
4747
CSS_007=Font-face reference "%1$s" refers to non-standard font type "%2$s".
@@ -68,7 +68,7 @@ CSS_025=CSS class Selector could not be found.
6868
CSS_025_SUG=Check for typos or define a class selector to document the use of the class.
6969
CSS_028=Use of Font-face declaration.
7070
CSS_029=Found CSS class name "%1$s" but no "%2$s" property was declared in the package document.
71-
CSS_030=The package document declares media overlays styling class names but no CSS was found in the content document.
71+
CSS_030=The package document declares media overlays styling class names but no CSS was found in the content document.
7272

7373
#HTM - XHTML related messages
7474
HTM_001=Any publication resource that is an XML-based media type must be a valid XML 1.0 document. XML version found: %1$s.
@@ -125,7 +125,8 @@ HTM_053=Found an external file link (file://) in file: "%1$s".
125125
HTM_054=Custom attribute namespace ("%1$s") must not include the string "%2$s" in its domain.
126126
HTM_055=The "%1$s" element should not be used (discouraged construct)
127127
HTM_056=Viewport metadata has no "%1$s" dimension (both "width" and "height" properties are required)
128-
HTM_057=Viewport "%1$s" value must be a positive number or the keyword "device-%1$s"
128+
HTM_057=Viewport "%1$s" value must be a positive number or the keyword "device-%1$s"
129+
HTM_058=HTML documents must be encoded in UTF-8, but UTF-16 was detected.
129130

130131
#media
131132
MED_001=Video poster must have core media image type.
@@ -346,7 +347,9 @@ RSC_022=Cannot check image details (requires Java version 7 or higher).
346347
RSC_023=Couldn’t parse host of URL "%1$s" (probably due to disallowed characters or missing slashes after the protocol)
347348
RSC_024=Informative parsing warning: %1$s
348349
RSC_025=Informative parsing error: %1$s
349-
RSC_026=URL "%1$s" leaks outside the container (it is not a valid-relative-ocf-URL-with-fragment string)
350+
RSC_026=URL "%1$s" leaks outside the container (it is not a valid-relative-ocf-URL-with-fragment string)
351+
RSC_027=XML document is encoded in UTF-16. It should be encoded in UTF-8 instead.
352+
RSC_028=XML documents must be encoded in UTF-8, but %1$s was detected.
350353

351354
#Scripting
352355
SCP_001=Use of Javascript eval() function in EPUB scripts is a security risk.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?xml version="1.0" encoding="ISO-8859-1"?>
2+
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
3+
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
<dc:title id="title">Minimal EPUB 3.0</dc:title>
5+
<dc:language>en</dc:language>
6+
<dc:identifier id="q">NOID</dc:identifier>
7+
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
8+
</metadata>
9+
<manifest>
10+
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
11+
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
12+
</manifest>
13+
<spine>
14+
<itemref idref="content_001" />
15+
</spine>
16+
</package>
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?xml version="1.0" encoding="unknown"?>
2+
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
3+
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
<dc:title id="title">Minimal EPUB 3.0</dc:title>
5+
<dc:language>en</dc:language>
6+
<dc:identifier id="q">NOID</dc:identifier>
7+
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
8+
</metadata>
9+
<manifest>
10+
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
11+
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
12+
</manifest>
13+
<spine>
14+
<itemref idref="content_001" />
15+
</spine>
16+
</package>

0 commit comments

Comments
 (0)