Skip to content

Commit b6ac8ea

Browse files
committed
feat: improve checking of OCF file name characters
- significantly refactored `OCFFilenameChecker`: - now based on ICU4J for more up-to-date Unicode support - iterate over the input string codepoints (`int`) instead of code units (`char`) for better support of charactres outside the BMP. - rely on ICU UnicodeSet to define the set of forbidden characters - improve the message reported for forbidden characters (notably for non-printable characters) - the class now implements the `Checker` interface - add unit tests for the `OCFFilenameChecker` only in a new feature file `filename-checker.feature`, located along with the EPUB 3 OCF functional tests, for easier lookup. - add a few functional tests, to make sure `OCFFilenameChecker` is correctly called when checking full publications or single package documents. - update ICU4J to v72.1 Fixes #1240, fixes #1292, fixes #1302
1 parent 5c35fa0 commit b6ac8ea

File tree

34 files changed

+515
-248
lines changed

34 files changed

+515
-248
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@
173173
<dependency>
174174
<groupId>com.ibm.icu</groupId>
175175
<artifactId>icu4j</artifactId>
176-
<version>70.1</version>
176+
<version>72.1</version>
177177
</dependency>
178178
<dependency>
179179
<groupId>net.sf.saxon</groupId>

src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,6 @@ private boolean checkContainerStructure(OCFCheckerState state)
287287
List<String> directories = new LinkedList<>();
288288

289289
// Loop through the entries
290-
OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
291-
// FIXME catch IAE MALFORMED entries
292290
for (OCFResource resource : resourcesProvider)
293291
{
294292
Preconditions.checkNotNull(resource.getPath());
@@ -318,7 +316,7 @@ else if (resources.containsKey(
318316
else
319317
{
320318
// Check file name requirements
321-
filenameChecker.checkCompatiblyEscaped(resource.getPath());
319+
new OCFFilenameChecker(resource.getPath(), state.context().build()).check();;
322320

323321
// report entry metadata
324322
reportFeatures(resource.getProperties());
Lines changed: 123 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,127 +1,162 @@
11
package com.adobe.epubcheck.ocf;
22

3+
import java.util.LinkedHashSet;
34
import java.util.Set;
5+
import java.util.stream.Collectors;
6+
7+
import org.w3c.epubcheck.core.Checker;
48

59
import com.adobe.epubcheck.api.EPUBLocation;
610
import com.adobe.epubcheck.api.Report;
711
import com.adobe.epubcheck.messages.MessageId;
812
import com.adobe.epubcheck.opf.ValidationContext;
913
import com.adobe.epubcheck.util.EPUBVersion;
10-
import com.google.common.collect.ImmutableSet;
14+
import com.google.common.base.Preconditions;
15+
import com.google.common.collect.ImmutableMap;
16+
import com.ibm.icu.lang.UCharacter;
17+
import com.ibm.icu.text.UCharacterIterator;
18+
import com.ibm.icu.text.UForwardCharacterIterator;
19+
import com.ibm.icu.text.UnicodeSet;
1120

12-
//FIXME 2022 update related PKG-* messages to contain the file name string
13-
public final class OCFFilenameChecker
21+
public final class OCFFilenameChecker implements Checker
1422
{
15-
private static final Set<String> RESTRICTED_30_CHARACTER_SET = ImmutableSet.of("PRIVATE_USE_AREA",
16-
"ARABIC_PRESENTATION_FORMS_A", "SPECIALS", "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
17-
"SUPPLEMENTARY_PRIVATE_USE_AREA_B", "VARIATION_SELECTORS_SUPPLEMENT", "TAGS");
23+
24+
private static final UnicodeSet ASCII = new UnicodeSet("[:ascii:]").freeze();
25+
26+
private static final UnicodeSet DISALLOWED_EPUB2 = new UnicodeSet()
27+
// .add(0x002F)// SOLIDUS '/' -- allowed as path separator
28+
.add(0x0022)// QUOTATION MARK '"'
29+
.add(0x002A)// ASTERISK '*'
30+
// .add(0x002E)// FULL STOP '.' -- only disallowed as the last character
31+
.add(0x003A)// COLON ':'
32+
.add(0x003C)// LESS-THAN SIGN '<'
33+
.add(0x003E)// GREATER-THAN SIGN '>'
34+
.add(0x003F)// QUESTION MARK '?'
35+
.add(0x005C)// REVERSE SOLIDUS '\'
36+
.freeze();
37+
38+
private static final ImmutableMap<String, UnicodeSet> DISALLOWED_EPUB3 = new ImmutableMap.Builder<String, UnicodeSet>()
39+
.put("ASCII", new UnicodeSet() //
40+
.addAll(DISALLOWED_EPUB2)// all disallowed in EPUB 2.0.1
41+
.add(0x007C) // VERTICAL LINE '|'
42+
.freeze())
43+
.put("NON CHARACTER", new UnicodeSet("[:Noncharacter_Code_Point=Yes:]")//
44+
.freeze())
45+
.put("CONTROL", new UnicodeSet().add(0x007F) // DEL
46+
.addAll(0x0000, 0x001F) // C0 range
47+
.addAll(0x0080, 0x009F) // C1 range
48+
.freeze())
49+
.put("PRIVATE USE", new UnicodeSet() //
50+
.addAll(0xE000, 0xF8FF) // Private Use Area
51+
.addAll(0xF0000, 0xFFFFF) // Supplementary Private Use Area-A
52+
.addAll(0x100000, 0x10FFFF) // Supplementary Private Use Area-B
53+
.freeze())
54+
.put("SPECIALS", new UnicodeSet() //
55+
.addAll(0xFFF0, 0xFFFF) // Specials Blocks
56+
.freeze())
57+
.put("DEPRECATED", new UnicodeSet() //
58+
.add(0xE0001)// LANGUAGE TAG
59+
// .add(0xE007F)// CANCEL TAG -- reinstated in Emoji tag sequences
60+
.freeze())
61+
.build();
62+
63+
private static String toString(int codepoint, String setName)
64+
{
65+
assert setName != null;
66+
StringBuilder result = new StringBuilder().append(String.format("U+%04X ", codepoint));
67+
if ("ASCII".equals(setName))
68+
{
69+
result.append('(').append(UCharacter.toString(codepoint)).append(')');
70+
}
71+
else
72+
{
73+
String characterName = UCharacter.getName(codepoint);
74+
if (characterName != null)
75+
{
76+
result.append(characterName).append(' ');
77+
}
78+
result.append('(').append(setName).append(')');
79+
}
80+
return result.toString();
81+
}
1882

1983
private final Report report;
2084
private final EPUBVersion version;
2185
private final EPUBLocation location;
86+
private final String filename;
87+
88+
public OCFFilenameChecker(String filename, ValidationContext context)
89+
{
90+
this(filename, context, null);
91+
}
2292

23-
public OCFFilenameChecker(ValidationContext context)
93+
public OCFFilenameChecker(String filename, ValidationContext context, EPUBLocation location)
2494
{
95+
Preconditions.checkArgument(filename != null);
96+
Preconditions.checkArgument(context != null);
97+
this.filename = filename;
2598
this.report = context.report;
2699
this.version = context.version;
27-
this.location = EPUBLocation.of(context);
100+
this.location = (location != null) ? location : EPUBLocation.of(context);
28101
}
29102

30-
public String checkCompatiblyEscaped(final String str)
103+
@Override
104+
public void check()
31105
{
32-
// don't check remote resources
33-
if (str.matches("^[^:/?#]+://.*"))
34-
{
35-
return "";
36-
}
37-
38-
// the test string will be used to compare test result
39-
String test = checkNonAsciiFilename(str);
40-
41-
if (str.endsWith("."))
42-
{
43-
report.message(MessageId.PKG_011, location, str);
44-
test += ".";
45-
}
46-
47-
boolean spaces = false;
48-
final char[] ascciGraphic = new char[] { '<', '>', '"', '{', '}', '|', '^', '`', '*',
49-
'?' /* , ':','/', '\\' */ };
50-
String result = "";
51-
char[] chars = str.toCharArray();
52-
for (char c : chars)
106+
// Iterate through the code points to search disallowed characters
107+
UCharacterIterator chars = UCharacterIterator.getInstance(filename);
108+
final Set<String> disallowed = new LinkedHashSet<>();
109+
boolean hasSpaces = false;
110+
boolean isASCIIOnly = true;
111+
int codepoint;
112+
while ((codepoint = chars.nextCodePoint()) != UForwardCharacterIterator.DONE)
53113
{
54-
for (char a : ascciGraphic)
114+
// Check if the string has non-ASCII characters
115+
isASCIIOnly = isASCIIOnly && ASCII.contains(codepoint);
116+
// Check if the string has space characters
117+
hasSpaces = hasSpaces || UCharacter.isUWhiteSpace(codepoint);
118+
// Check for disallowed characters
119+
switch (version)
55120
{
56-
if (c == a)
121+
case VERSION_2:
122+
if (DISALLOWED_EPUB2.contains(codepoint))
57123
{
58-
result += "\"" + Character.toString(c) + "\",";
59-
test += Character.toString(c);
124+
disallowed.add(toString(codepoint, "ASCII"));
60125
}
61-
}
62-
if (Character.isSpaceChar(c))
63-
{
64-
spaces = true;
65-
test += Character.toString(c);
126+
break;
127+
default:
128+
for (String name : DISALLOWED_EPUB3.keySet())
129+
{
130+
if (DISALLOWED_EPUB3.get(name).contains(codepoint))
131+
{
132+
disallowed.add(toString(codepoint, name));
133+
break;
134+
}
135+
}
136+
break;
66137
}
67138
}
68-
if (result.length() > 1)
139+
// Check that FULL STOP is not used as the last character
140+
if (chars.previousCodePoint() == 0x002E)
69141
{
70-
result = result.substring(0, result.length() - 1);
71-
report.message(MessageId.PKG_009, location, str, result);
142+
report.message(MessageId.PKG_011, location, filename);
72143
}
73-
if (spaces)
144+
// Report if disallowed characters were found
145+
if (!disallowed.isEmpty())
74146
{
75-
report.message(MessageId.PKG_010, location, str);
147+
report.message(MessageId.PKG_009, location, filename,
148+
disallowed.stream().collect(Collectors.joining(", ")));
76149
}
77-
78-
if (version == EPUBVersion.VERSION_3)
150+
// Report whitespace characters
151+
if (hasSpaces)
79152
{
80-
checkCompatiblyEscaped30(str, test);
153+
report.message(MessageId.PKG_010, location, filename);
81154
}
82-
return test;
83-
}
84-
85-
private String checkNonAsciiFilename(final String str)
86-
{
87-
String nonAscii = str.replaceAll("[\\p{ASCII}]", "");
88-
if (nonAscii.length() > 0)
155+
// Report non-ASCII characters as usage
156+
if (!isASCIIOnly)
89157
{
90-
report.message(MessageId.PKG_012, location, str, nonAscii);
158+
report.message(MessageId.PKG_012, location, filename);
91159
}
92-
return nonAscii;
93160
}
94161

95-
private String checkCompatiblyEscaped30(String str, String test)
96-
{
97-
String result = "";
98-
99-
char[] chars = str.toCharArray();
100-
for (char c : chars)
101-
{
102-
if (Character.isISOControl(c))
103-
{
104-
result += "\"" + Character.toString(c) + "\",";
105-
test += Character.toString(c);
106-
}
107-
108-
// DEL (U+007F)
109-
if (c == '\u007F')
110-
{
111-
result += "\"" + Character.toString(c) + "\",";
112-
test += Character.toString(c);
113-
}
114-
String unicodeType = Character.UnicodeBlock.of(c).toString();
115-
if (RESTRICTED_30_CHARACTER_SET.contains(unicodeType))
116-
{
117-
result += "\"" + Character.toString(c) + "\",";
118-
}
119-
}
120-
if (result.length() > 1)
121-
{
122-
result = result.substring(0, result.length() - 1);
123-
report.message(MessageId.PKG_009, location, str, result);
124-
}
125-
return test;
126-
}
127162
}

src/main/java/com/adobe/epubcheck/opf/OPFChecker.java

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -207,16 +207,12 @@ protected boolean checkContent()
207207

208208
for (OPFItem item : opfHandler.getItems())
209209
{
210-
// only check Filename CompatiblyEscaped when in "-mode opf"
211-
// this is when 'xrefChecker' Object is null which is an indicator for
212-
// single file validation
213-
// (Had no better possibility in mind since "mode" isn't available in
214-
// OPFChecker.java)
215-
//
216-
// bugfix for issue 239
217-
if (!context.xrefChecker.isPresent())
210+
// only check the filename in single-file mode
211+
// (it is checked by the container checker in full-publication mode)
212+
// and for local resources (i.e. computed to a file URL)
213+
if (!context.container.isPresent() && !item.isRemote())
218214
{
219-
new OCFFilenameChecker(context).checkCompatiblyEscaped(item.getPath());
215+
new OCFFilenameChecker(item.getPath(), context, item.getLocation()).check();
220216
}
221217
if (!item.equals(opfHandler.getItemByURL(item.getURL()).orNull()))
222218
{

src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -297,10 +297,10 @@ PKG_005=The mimetype file has an extra field of length %1$s. The use of the extr
297297
PKG_006=Mimetype file entry is missing or is not the first file in the archive.
298298
PKG_007=Mimetype file should only contain the string "application/epub+zip" and should not be compressed.
299299
PKG_008=Unable to read file "%1$s".
300-
PKG_009=File name contains characters that are not allowed in OCF file names: "%1$s".
301-
PKG_010=Filename contains spaces, therefore URI escaping is necessary. Consider removing spaces from filename.
302-
PKG_011=Filename is not allowed to end with ".".
303-
PKG_012=File name contains the following non-ascii characters: %1$s. Consider changing the filename.
300+
PKG_009=The file name "%1$s" contains characters that are not allowed in OCF file names: %2$s.
301+
PKG_010=The file name "%1$s" contains spaces, which may create interoperability issues with older reading systems.
302+
PKG_011=The file name "%1$s" is not allowed to end with ".".
303+
PKG_012=The file name "%1$s" contains non-ASCII characters, which might create interoperability issues with older reading systems.
304304
PKG_013=The EPUB file includes multiple OPS renditions.
305305
PKG_014=The EPUB contains empty directory "%1$s".
306306
PKG_015=Unable to read EPUB contents: %1$s

0 commit comments

Comments
 (0)