Skip to content

Commit 5248914

Browse files
committed
feat: improve detection of non-UTF-8 file names
- explicitly read ZIP archives as UTF-8 encoded
- report a (new) fatal error PKG-027 when a ZIP archive could not be read due to encoding issues (detected by matching the exception message)
- add tests

Fixes #1236
1 parent 62d67e7 commit 5248914

File tree

10 files changed

+104
-65
lines changed

10 files changed

+104
-65
lines changed

src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,7 @@ private void initialize()
307307
severities.put(MessageId.PKG_024, Severity.INFO);
308308
severities.put(MessageId.PKG_025, Severity.ERROR);
309309
severities.put(MessageId.PKG_026, Severity.ERROR);
310+
severities.put(MessageId.PKG_027, Severity.FATAL);
310311

311312
// Resources
312313
severities.put(MessageId.RSC_001, Severity.ERROR);

src/main/java/com/adobe/epubcheck/messages/MessageId.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ public enum MessageId implements Comparable<MessageId>
301301
PKG_024("PKG-024"),
302302
PKG_025("PKG-025"),
303303
PKG_026("PKG-026"),
304+
PKG_027("PKG-027"),
304305

305306
// Messages relating to resources
306307
RSC_001("RSC-001"),

src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java

Lines changed: 76 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,11 @@ public void check()
8585
// Check the OCF Container file structure
8686
// --------------------------------------
8787
//
88-
checkContainerStructure(state);
88+
if (!checkContainerStructure(state))
89+
{
90+
return;
91+
}
92+
;
8993
OCFContainer container = state.getContainer();
9094

9195
//
@@ -270,83 +274,92 @@ private boolean checkContainerFile(OCFCheckerState state)
270274
return true;
271275
}
272276

273-
private void checkContainerStructure(OCFCheckerState state)
277+
private boolean checkContainerStructure(OCFCheckerState state)
274278
{
275-
// Get a container
276-
Iterable<OCFResource> resourcesProvider;
277279
try
278280
{
279281
// FIXME 2022 build resourcesProvider depending on MIME type
280-
resourcesProvider = new OCFZipResources(context.url);
281-
} catch (IOException e)
282-
{
283-
// FIXME 2022 see how to propagate fatal IOError
284-
report.message(MessageId.PKG_008, EPUBLocation.of(context), e.getLocalizedMessage());
285-
return;
286-
}
287-
// Map to store the container resource files
288-
Map<String, OCFResource> resources = new HashMap<>();
289-
// List to store the container resource directories
290-
List<String> directories = new LinkedList<>();
291-
292-
// Loop through the entries
293-
OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
294-
for (OCFResource resource : resourcesProvider)
295-
{
296-
Preconditions.checkNotNull(resource.getPath());
297-
Preconditions.checkNotNull(resource.getProperties());
282+
// Get a container
283+
Iterable<OCFResource> resourcesProvider = new OCFZipResources(context.url);
284+
// Map to store the container resource files
285+
Map<String, OCFResource> resources = new HashMap<>();
286+
// List to store the container resource directories
287+
List<String> directories = new LinkedList<>();
288+
289+
// Loop through the entries
290+
OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
291+
// FIXME catch IAE MALFORMED entries
292+
for (OCFResource resource : resourcesProvider)
293+
{
294+
Preconditions.checkNotNull(resource.getPath());
295+
Preconditions.checkNotNull(resource.getProperties());
298296

299-
// FIXME 2022 report symbolic links and continue
297+
// FIXME 2022 report symbolic links and continue
300298

301-
// Check duplicate entries
302-
if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT)))
303-
{
304-
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
305-
}
306-
// Check duplicate entries after NFC normalization
307-
else if (resources.containsKey(
308-
Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
309-
{
310-
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
311-
}
299+
// Check duplicate entries
300+
if (resources.containsKey(resource.getPath().toLowerCase(Locale.ROOT)))
301+
{
302+
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
303+
}
304+
// Check duplicate entries after NFC normalization
305+
else if (resources.containsKey(
306+
Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
307+
{
308+
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
309+
}
312310

313-
// Store the resource in the data structure
314-
if (resource.isDirectory())
315-
{
316-
// the container resource is a directory,
317-
// store it for later checking of empty directories
318-
directories.add(resource.getPath());
319-
}
320-
else
321-
{
322-
// Check file name requirements
323-
filenameChecker.checkCompatiblyEscaped(resource.getPath());
324-
325-
// report entry metadata
326-
reportFeatures(resource.getProperties());
327-
// the container resource is a file,
328-
// add the resource to the container model
329-
resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource);
330-
state.addResource(resource);
311+
// Store the resource in the data structure
312+
if (resource.isDirectory())
313+
{
314+
// the container resource is a directory,
315+
// store it for later checking of empty directories
316+
directories.add(resource.getPath());
317+
}
318+
else
319+
{
320+
// Check file name requirements
321+
filenameChecker.checkCompatiblyEscaped(resource.getPath());
322+
323+
// report entry metadata
324+
reportFeatures(resource.getProperties());
325+
// the container resource is a file,
326+
// add the resource to the container model
327+
resources.put(resource.getPath().toLowerCase(Locale.ROOT), resource);
328+
state.addResource(resource);
329+
}
331330
}
332-
}
333331

334-
// Report empty directories
335-
for (String directory : directories)
336-
{
337-
boolean hasContents = false;
338-
for (OCFResource resource : resources.values())
332+
// Report empty directories
333+
for (String directory : directories)
339334
{
340-
if (resource.getPath().startsWith(directory))
335+
boolean hasContents = false;
336+
for (OCFResource resource : resources.values())
337+
{
338+
if (resource.getPath().startsWith(directory))
339+
{
340+
hasContents = true;
341+
break;
342+
}
343+
}
344+
if (!hasContents)
341345
{
342-
hasContents = true;
343-
break;
346+
report.message(MessageId.PKG_014, EPUBLocation.of(context), directory);
344347
}
345348
}
346-
if (!hasContents)
349+
return true;
350+
} catch (Exception e)
351+
{
352+
switch (e.getMessage())
347353
{
348-
report.message(MessageId.PKG_014, EPUBLocation.of(context), directory);
354+
case "invalid CEN header (bad entry name)": // reported by OpenJDK
355+
case "MALFORMED": // reported by Oracle JDK 1.8
356+
report.message(MessageId.PKG_027, EPUBLocation.of(context), e.getLocalizedMessage());
357+
break;
358+
default:
359+
report.message(MessageId.PKG_008, EPUBLocation.of(context), e.getLocalizedMessage());
360+
break;
349361
}
362+
return false;
350363
}
351364
}
352365

src/main/java/com/adobe/epubcheck/ocf/OCFZipResources.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import java.io.IOException;
55
import java.io.InputStream;
66
import java.net.URISyntaxException;
7+
import java.nio.charset.StandardCharsets;
78
import java.security.MessageDigest;
89
import java.util.Enumeration;
910
import java.util.Iterator;
@@ -32,7 +33,7 @@ public OCFZipResources(URL url) throws IOException
3233
{
3334
new IllegalArgumentException("Not a file URL: " + url);
3435
}
35-
this.zip = new ZipFile(file);
36+
this.zip = new ZipFile(file, StandardCharsets.UTF_8);
3637
}
3738

3839
@Override

src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,8 @@ PKG_023=Validating the EPUB against version 2.0, default validation profile will
316316
PKG_024=Uncommon EPUB file extension.
317317
PKG_024_SUG=For maximum compatibility, use ".epub".
318318
PKG_025=Publication resource must not be located in the META-INF directory
319-
PKG_026=Obfuscated resource must be a Font Core Media Type (was declared as "%1$s" in "%2$s").
319+
PKG_026=Obfuscated resource must be a Font Core Media Type (was declared as "%1$s" in "%2$s").
320+
PKG_027=Could not extract EPUB ZIP content, probably due to file names not encoded in UTF-8.
320321

321322
#Resources
322323
RSC_001=File "%1$s" could not be found.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

src/test/resources/epub3/04-ocf/ocf.feature

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,28 @@ Feature: EPUB 3 — Open Container Format
181181
Then error OPF-060 is reported
182182
And no other errors or warnings are reported
183183

184+
@spec @xref:sec-zip-container-zipreqs
185+
Scenario: Verify file names with non-ASCII UTF-8-encoded characters are allowed
186+
When checking EPUB 'ocf-filename-utf8-valid.epub'
187+
Then no errors or warnings are reported
188+
189+
@spec @xref:sec-zip-container-zipreqs
190+
Scenario: Report file names that are not encoded as UTF-8
191+
When checking EPUB 'ocf-filename-not-utf8-error.epub'
192+
Then fatal error PKG-027 is reported
193+
Then no errors or warnings are reported
194+
195+
@spec @xref:sec-zip-container-zipreqs
196+
Scenario: Verify path names with non-ASCII UTF-8-encoded characters are allowed
197+
When checking EPUB 'ocf-filepath-utf8-valid.epub'
198+
Then no errors or warnings are reported
199+
200+
@spec @xref:sec-zip-container-zipreqs
201+
Scenario: Report path names that are not encoded as UTF-8
202+
When checking EPUB 'ocf-filepath-not-utf8-error.epub'
203+
Then fatal error PKG-027 is reported
204+
Then no errors or warnings are reported
205+
184206

185207
### 4.2.3 OCF ZIP container media type identification
186208

0 commit comments

Comments
 (0)