Skip to content

Commit 0323668

Browse files
committed
fix: normalize URLs before checking if the resources exist
URLs were not normalized before existence checks were performed, so percent-encoded URLs sometimes triggered `RSC-001` or `RSC-007` errors. This commit introduces a new `normalize(URL)` method in the `URLUtils` class. Normalization is now applied whenever a URL is checked, which notably covers resource and ID existence checks. Important note: URL normalization is not well-defined. Some percent-encoding normalization is described in RFC 3986, but it is not defined in the URL standard. Normalization (as useful for EPUBCheck) also depends on the URL scheme. The normalization we apply is quite naïve and might need to be improved in the future. It should, however, cover the majority of real-world HTTP URL scenarios. Fix #1479
1 parent 0f0cece commit 0323668

File tree

9 files changed

+90
-7
lines changed

9 files changed

+90
-7
lines changed

src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,14 @@ public OCFContainer(Builder builder)
7979

8080
public boolean contains(URL resource)
8181
{
82-
return resources.containsKey(resource);
82+
if (resources.containsKey(resource))
83+
{
84+
return true;
85+
}
86+
else
87+
{
88+
return resources.containsKey(URLUtils.normalize(resource));
89+
}
8390
}
8491

8592
@Override
@@ -134,5 +141,4 @@ public boolean isRemote(URL url)
134141
}
135142
}
136143

137-
138144
}

src/main/java/org/w3c/epubcheck/core/references/URLChecker.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import java.net.URI;
44

5+
import org.w3c.epubcheck.util.url.URLUtils;
6+
57
import com.adobe.epubcheck.api.EPUBLocation;
68
import com.adobe.epubcheck.api.Report;
79
import com.adobe.epubcheck.messages.MessageId;
@@ -64,7 +66,7 @@ public URL setBase(String newBase, EPUBLocation location)
6466

6567
public URL checkURL(String string, EPUBLocation location)
6668
{
67-
URL url = resolveURL(string, false, location);
69+
URL url = URLUtils.normalize(resolveURL(string, false, location));
6870
return url;
6971
}
7072

src/main/java/org/w3c/epubcheck/util/url/URLUtils.java

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import io.mola.galimatias.GalimatiasParseException;
1515
import io.mola.galimatias.ParseIssue;
1616
import io.mola.galimatias.URL;
17+
import io.mola.galimatias.canonicalize.DecodeUnreservedCanonicalizer;
1718

1819
//FIXME 2022 add unit tests
1920
public final class URLUtils
@@ -79,9 +80,9 @@ else if (urlA.equals(urlB))
7980
* in EPUB (to test for remote resources compared to container URLs).
8081
*
8182
* @param test
82-
* the URL to test
83+
* the URL to test
8384
* @param local
84-
* the URL it is tested against
85+
* the URL it is tested against
8586
* @return `true` if and only if `test` is remote compared to `local`.
8687
*/
8788
public static boolean isRemote(URL test, URL local)
@@ -151,13 +152,33 @@ public static String decode(String string)
151152
return percentDecode(string);
152153
}
153154

155+
/**
* Returns a normalized form of the given URL, for use when comparing URLs
* (for instance when checking that a referenced resource exists).
*
* For a hierarchical URL with a non-null path, the path is first
* percent-decoded with `decode` and then re-encoded with `encodePath`. The
* result is then passed through galimatias' `DecodeUnreservedCanonicalizer`.
*
* @param url
* the URL to normalize, can be `null`.
* @return the normalized URL, or `null` if `url` is `null`.
* @throws AssertionError
* if galimatias reports a `GalimatiasParseException` while
* rebuilding or canonicalizing the URL (treated as unexpected).
*/
public static URL normalize(URL url)
156+
{
157+
URL normalized = url;
158+
if (url != null)
159+
{
160+
try
161+
{
162+
// Only hierarchical URLs get their path decoded and re-encoded;
// other URLs go straight to the canonicalizer below.
if (url.isHierarchical() && url.path() != null)
163+
{
164+
normalized = url.withPath(URLUtils.encodePath(URLUtils.decode(url.path())));
165+
}
166+
normalized = new DecodeUnreservedCanonicalizer().canonicalize(normalized);
167+
} catch (GalimatiasParseException unexpected)
168+
{
169+
// A parse failure here is treated as a programming error, not a
// condition to report to the caller.
throw new AssertionError(unexpected);
170+
}
171+
}
172+
return normalized;
173+
}
174+
154175
/**
155176
* Returns the MIME type of a `data:` URL.
156177
*
157178
* @param url
158-
* a URL, can be `null`.
179+
* a URL, can be `null`.
159180
* @return the MIME type declared in the data URL (can be an empty string), or
160-
* `null` if `url` is not a data URL.
181+
* `null` if `url` is not a data URL.
161182
*/
162183
public static String getDataURLType(URL url)
163184
{
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<!DOCTYPE html>
2+
<html xmlns:epub="http://www.idpf.org/2007/ops" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
3+
<head>
4+
<meta charset="utf-8"/>
5+
<title>Minimal EPUB</title>
6+
</head>
7+
<body>
8+
<h1>Loomings</h1>
9+
<p>Call me Ishmael.</p>
10+
</body>
11+
</html>
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<!DOCTYPE html>
2+
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en" lang="en">
3+
<head>
4+
<meta charset="utf-8"/>
5+
<title>Minimal Nav</title>
6+
</head>
7+
<body>
8+
<nav epub:type="toc">
9+
<ol>
10+
<li><a href="content%26001.xhtml">content 001</a></li>
11+
</ol>
12+
</nav>
13+
</body>
14+
</html>
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
3+
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
4+
<dc:title id="title">Minimal EPUB 3.0</dc:title>
5+
<dc:language>en</dc:language>
6+
<dc:identifier id="q">NOID</dc:identifier>
7+
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
8+
</metadata>
9+
<manifest>
10+
<item id="content_001" href="content%26001.xhtml" media-type="application/xhtml+xml"/>
11+
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
12+
</manifest>
13+
<spine>
14+
<itemref idref="content_001" />
15+
</spine>
16+
</package>
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
<?xml version="1.0" encoding="UTF-8" ?>
2+
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
3+
<rootfiles>
4+
<rootfile full-path="EPUB/package.opf" media-type="application/oebps-package+xml"/>
5+
</rootfiles>
6+
</container>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
application/epub+zip

src/test/resources/epub3/04-ocf/ocf.feature

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ Feature: EPUB 3 — Open Container Format
114114
When checking EPUB 'url-in-xhtml-valid.xhtml'
115115
And no errors or warnings are reported
116116

117+
@spec @xref:sec-container-iri
118+
Scenario: Allow percent-encoded URLs
119+
When checking EPUB 'url-percent-encoded-valid'
120+
And no errors or warnings are reported
121+
122+
117123
#### Invalid container URLs
118124

119125
@spec @xref:sec-container-iri

0 commit comments

Comments
 (0)