Skip to content

Commit d2728ee

Browse files
committed
fix: improve reporting of invalid URL host parts
- fix #1034: make the message more generic ("Couldn't parse host…") - fix #1079: don't report underscores used in the URL host part - expand test `testValidateXHTMLUrlChecksInvalid`
1 parent 6af3b98 commit d2728ee

File tree

4 files changed

+21
-10
lines changed

4 files changed

+21
-10
lines changed

src/main/java/com/adobe/epubcheck/ops/OPSHandler.java

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222

2323
package com.adobe.epubcheck.ops;
2424

25+
import java.net.MalformedURLException;
2526
import java.net.URI;
2627
import java.net.URISyntaxException;
28+
import java.net.URL;
2729
import java.util.Locale;
2830
import java.util.Stack;
2931

@@ -226,15 +228,22 @@ else if (".".equals(href))
226228
{
227229
report.info(path, FeatureEnum.REFERENCE, href);
228230

229-
/*
230-
* #708 report invalid HTTP/HTTPS URLs
231-
* uri.scheme may be correct, but missing a : or a / from the //
232-
* leads to uri.getHost() == null
233-
*/
231+
// Report if the host part couldn't be parsed correctly
232+
// (either due to missing slashes (issue #708) or invalid characters (issue #1034))
234233
if (uri.getHost() == null)
235234
{
236-
int missingSlashes = uri.getSchemeSpecificPart().startsWith("/") ? 1 : 2;
237-
report.message(MessageId.RSC_023, parser.getLocation(), uri, missingSlashes, uri.getScheme());
235+
try
236+
{
237+
// if the URL contains underscore characters, try reparsing it without them,
238+
// as underscores are accepted by browsers in the host part (even if it's disallowed)
239+
// see issue #1079
240+
if (!href.contains("_") || new URI(href.replace('_', 'x')).getHost() == null) {
241+
report.message(MessageId.RSC_023, parser.getLocation(), uri);
242+
}
243+
} catch (URISyntaxException ignored)
244+
{
245+
// ignored (well-formedness errors are caught earlier)
246+
}
238247
}
239248
}
240249

src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ RSC_019=EPUBs with Multiple Renditions should contain a META-INF/metadata.xml fi
326326
RSC_020='%1$s' is not a valid URI.
327327
RSC_021=A Search Key Map Document must point to Content Documents ('%1$s' was not found in the spine).
328328
RSC_022=Cannot check image details (requires Java version 7 or higher).
329-
RSC_023=The URL '%1$s' is missing %2$d slash(es) '/' after the protocol '%3$s:'
329+
RSC_023=Couldn't parse host of URL '%1$s' (probably due to disallowed characters or missing slashes after the protocol)
330330

331331
#Scripting
332332
SCP_001=Use of Javascript eval() function in EPUB scripts is a security risk.

src/test/java/com/adobe/epubcheck/ops/OPSCheckerTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ public void testValidateXHTMLLINKInvalid()
230230
public void testValidateXHTMLUrlChecksInvalid()
231231
{
232232
Collections.addAll(expectedErrors, MessageId.RSC_020);
233-
Collections.addAll(expectedWarnings, MessageId.HTM_025, MessageId.RSC_023, MessageId.RSC_023);
233+
Collections.addAll(expectedWarnings, MessageId.HTM_025, MessageId.RSC_023, MessageId.RSC_023, MessageId.RSC_023);
234234
testValidateDocument("xhtml/invalid/url-checks_issue-708.xhtml", "application/xhtml+xml",
235235
EPUBVersion.VERSION_3);
236236
}

src/test/resources/30/single/xhtml/invalid/url-checks_issue-708.xhtml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
<a href="httpf://www.youtube.com/watch?v=xxxxxxxxxxx">Unsupported URI scheme (HTM-025)</a>
1212
<a href="https:/www.youtube.com/watch?v=xxxxxxxxxxx">URL is missing slashes after protocol (RSC-023)</a>
1313
<a href="https:www.youtube.com/watch?v=xxxxxxxxxxx">URL is missing slashes after protocol (RSC-023)</a>
14-
14+
<a href="https://w,w.example.com/watch?v=xxxxxxxxxxx">Host contains an invalid character (RSC-023)</a>
15+
16+
<a href="https://w_w.example.com">Underscores in hosts are accepted in most browsers</a>
1517
<a href="https://www.youtube.com/watch?v=xxxxxxxxxxx">Valid URI</a>
1618
<a href="https://youtube.com/watch?v=xxxxxxxxxxx">Valid URI</a>
1719
<a href="https://youtube.com/watch?v=xxxxxx%20xxxx">Valid URI</a>

0 commit comments

Comments
 (0)