Skip to content

Commit bec390e

Browse files
committed
feat: better parse URL fragment micro syntaxes
This commit introduces a new `URLFragment` class to represent URL fragments. Fragment strings are parsed into `URLFragment` instances using MIME type-specific logic, implementing some validity checks for a few micro syntaxes, including: shorthand bare name IDs, scheme-based fragments, and media fragments. SVG and HTML/XHTML MIME types are supported. The parser is tested in the `url-fragment.feature` feature file, in a new `unit-tests` directory.
1 parent 2e474e3 commit bec390e

File tree

9 files changed

+636
-32
lines changed

9 files changed

+636
-32
lines changed

src/main/java/com/adobe/epubcheck/opf/OPFChecker.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ protected boolean checkPackage()
116116

117117
List<OPFItem> items = opfHandler.getItems();
118118
report.info(null, FeatureEnum.ITEMS_COUNT, Integer.toString(items.size()));
119+
120+
// Register package doc and items to the XRefChecker
121+
xrefChecker.registerResource(context.url, context.mimeType);
119122
for (OPFItem item : items)
120123
{
121124
xrefChecker.registerResource(item,

src/main/java/com/adobe/epubcheck/opf/OPFChecker30.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
import java.util.Iterator;
2626
import java.util.Set;
2727

28+
import org.w3c.epubcheck.url.URLFragment;
29+
2830
import com.adobe.epubcheck.api.EPUBLocation;
2931
import com.adobe.epubcheck.api.EPUBProfile;
3032
import com.adobe.epubcheck.api.FeatureReport.Feature;
@@ -387,7 +389,8 @@ private void checkPreviewCollection(ResourceCollection collection)
387389
}
388390
else
389391
{
390-
if (Optional.fromNullable(resource.getURL().fragment()).or("").startsWith("epubcfi("))
392+
URLFragment fragment = URLFragment.parse(resource.getURL());
393+
if (fragment.exists() && "epubcfi".equals(fragment.getScheme()))
391394
{
392395
report.message(MessageId.OPF_076, EPUBLocation.of(context));
393396
}

src/main/java/com/adobe/epubcheck/opf/XRefChecker.java

Lines changed: 45 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,9 @@
3131
import java.util.Map;
3232
import java.util.Queue;
3333
import java.util.Set;
34-
import java.util.regex.Pattern;
3534

35+
import org.w3c.epubcheck.constants.MIMEType;
36+
import org.w3c.epubcheck.url.URLFragment;
3637
import org.w3c.epubcheck.url.URLUtils;
3738

3839
import com.adobe.epubcheck.api.EPUBLocation;
@@ -126,6 +127,7 @@ public static final class Builder
126127
private OPFItem item = null;
127128
private boolean hasItemFallback = false;
128129
private boolean hasImageFallback = false;
130+
public String mimetype;
129131

130132
public Builder url(URL url)
131133
{
@@ -137,6 +139,13 @@ public Builder item(OPFItem item)
137139
{
138140
this.url = item.getURL();
139141
this.item = item;
142+
this.mimetype = item.getMimeType();
143+
return this;
144+
}
145+
146+
public Builder mimetype(String mimetype)
147+
{
148+
this.mimetype = mimetype;
140149
return this;
141150
}
142151

@@ -231,8 +240,6 @@ public boolean isInSpine()
231240
}
232241
}
233242

234-
private static final Pattern REGEX_SVG_VIEW = Pattern.compile("svgView\\(.*\\)");
235-
236243
private final Map<URL, Resource> resources = new HashMap<URL, Resource>();
237244

238245
private final Set<URL> undeclared = new HashSet<URL>();
@@ -281,7 +288,7 @@ public Optional<OPFItem> getResource(URL url)
281288
* @param path
282289
* the path to a publication resource
283290
* @return an immutable {@link EnumSet} containing the types of references to
284-
* {@code path}.
291+
* {@code path}.
285292
*/
286293
public Set<Type> getTypes(URL resource)
287294
{
@@ -413,9 +420,15 @@ public void checkReferences()
413420
private void checkReference(URLReference reference)
414421
{
415422
Resource hostResource = resources.get(reference.location.url);
416-
Resource targetResource = resources.get(reference.targetDoc);
423+
424+
// Retrieve the Resource instance representing the targeted document
417425
// If the resource was not declared in the manifest,
418426
// we build a new Resource object for the data URL.
427+
Resource targetResource = resources.get(reference.targetDoc);
428+
String targetMimetype = (targetResource != null) ? targetResource.getMimeType() : "";
429+
430+
// Parse the URL fragment
431+
URLFragment fragment = URLFragment.parse(reference.url, targetMimetype);
419432

420433
// Check remote resources
421434
if (container.isRemote(reference.url)
@@ -470,15 +483,18 @@ else if (!undeclared.contains(reference.targetDoc)
470483
return;
471484
}
472485

473-
String mimetype = targetResource.getMimeType();
474-
475486
// Type-specific checks
476487
switch (reference.type)
477488
{
478489
case HYPERLINK:
490+
if ("epubcfi".equals(fragment.getScheme()))
491+
{
492+
break; // EPUB CFI is not supported
493+
}
479494
// if mimeType is null, we should have reported an error already
480-
if (!OPFChecker.isBlessedItemType(mimetype, version)
481-
&& !OPFChecker.isDeprecatedBlessedItemType(mimetype) && !targetResource.hasItemFallback())
495+
if (!OPFChecker.isBlessedItemType(targetMimetype, version)
496+
&& !OPFChecker.isDeprecatedBlessedItemType(targetMimetype)
497+
&& !targetResource.hasItemFallback())
482498
{
483499
report.message(MessageId.RSC_010,
484500
reference.location.context(container.relativize(reference.url)));
@@ -494,31 +510,35 @@ else if (!undeclared.contains(reference.targetDoc)
494510
case IMAGE:
495511
case PICTURE_SOURCE:
496512
case PICTURE_SOURCE_FOREIGN:
497-
if (reference.url.fragment() != null && !mimetype.equals("image/svg+xml"))
513+
if ("epubcfi".equals(fragment.getScheme()))
514+
{
515+
break; // EPUB CFI is not supported
516+
}
517+
if (fragment.exists() && !MIMEType.SVG.is(targetMimetype))
498518
{
499519
report.message(MessageId.RSC_009,
500520
reference.location.context(container.relativize(reference.url)));
501521
return;
502522
}
503523
// if mimeType is null, we should have reported an error already
504-
if (!OPFChecker.isBlessedImageType(mimetype, version))
524+
if (!OPFChecker.isBlessedImageType(targetMimetype, version))
505525
{
506526
if (version == EPUBVersion.VERSION_3 && reference.type == Type.PICTURE_SOURCE)
507527
{
508528
report.message(MessageId.MED_007, reference.location,
509-
container.relativize(reference.targetDoc), mimetype);
529+
container.relativize(reference.targetDoc), targetMimetype);
510530
return;
511531
}
512532
else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
513533
{
514534
report.message(MessageId.MED_003, reference.location,
515-
container.relativize(reference.targetDoc), mimetype);
535+
container.relativize(reference.targetDoc), targetMimetype);
516536
}
517537
}
518538
break;
519539
case SEARCH_KEY:
520540
// TODO update when we support EPUB CFI
521-
if ((reference.url.fragment() == null || !reference.url.fragment().startsWith("epubcfi("))
541+
if ((!fragment.exists() || !"epubcfi".equals(fragment.getScheme()))
522542
&& !targetResource.isInSpine())
523543
{
524544
report.message(MessageId.RSC_021, reference.location,
@@ -527,7 +547,7 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
527547
}
528548
break;
529549
case STYLESHEET:
530-
if (reference.url.fragment() != null)
550+
if (fragment.exists())
531551
{
532552
report.message(MessageId.RSC_013,
533553
reference.location.context(container.relativize(reference.url)));
@@ -551,7 +571,7 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
551571
case SVG_CLIP_PATH:
552572
case SVG_PAINT:
553573
case SVG_SYMBOL:
554-
if (reference.url.fragment() == null)
574+
if (!fragment.exists())
555575
{
556576
report.message(MessageId.RSC_015, reference.location.context(reference.url));
557577
return;
@@ -562,32 +582,32 @@ else if (reference.type == Type.IMAGE && !targetResource.hasImageFallback())
562582
}
563583

564584
// Fragment integrity checks
565-
String fragment = reference.url.fragment();
566-
if (fragment != null && !fragment.isEmpty())
585+
if (fragment.exists() && !fragment.isEmpty())
567586
{
568587
// EPUB CFI
569-
if (fragment.startsWith("epubcfi("))
588+
if ("epubcfi".equals(fragment.getScheme()))
570589
{
590+
// FIXME HOT should warn if in MO
571591
// FIXME epubcfi currently not supported (see issue 150).
572592
return;
573593
}
574594
// Media fragments in Data Navigation Documents
575-
else if (fragment.contains("=") && hostResource != null && hostResource.hasItem()
595+
else if (fragment.isMediaFragment() && hostResource != null && hostResource.hasItem()
576596
&& hostResource.getItem().getProperties()
577597
.contains(PackageVocabs.ITEM_VOCAB.get(PackageVocabs.ITEM_PROPERTIES.DATA_NAV)))
578598
{
579599
// Ignore,
580600
return;
581601
}
582-
// SVG view fragments are ignored
583-
else if (mimetype.equals("image/svg+xml") && REGEX_SVG_VIEW.matcher(fragment).matches())
602+
// Non-ID-based fragments are ignored
603+
else if (fragment.getId().isEmpty())
584604
{
585605
return;
586606
}
587607
// Fragment Identifier (by default)
588608
else if (!container.isRemote(reference.targetDoc))
589609
{
590-
ID anchor = targetResource.ids.get(fragment);
610+
ID anchor = targetResource.ids.get(fragment.getId());
591611
if (anchor == null)
592612
{
593613
report.message(MessageId.RSC_012, reference.location.context(reference.url.toString()));
@@ -674,7 +694,8 @@ private void checkReadingOrder(Queue<URLReference> references, int lastSpinePosi
674694
}
675695

676696
// check that the fragment is in document order
677-
int targetAnchorPosition = res.getIDPosition(ref.url.fragment());
697+
URLFragment fragment = URLFragment.parse(ref.url, res.getMimeType());
698+
int targetAnchorPosition = res.getIDPosition(fragment.getId());
678699
if (targetAnchorPosition < lastAnchorPosition)
679700
{
680701
String orderContext = LocalizedMessages.getInstance(locale).getSuggestion(MessageId.NAV_011,

src/main/java/com/adobe/epubcheck/ops/OPSHandler.java

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,6 @@ else if (".".equals(href))
141141

142142
// If the URL was not properly parsed, return early
143143
if (url == null) return;
144-
// If the URL is an EPUB CFI, return (not implemented)
145-
if (url.fragment() != null && url.fragment().matches("epubcfi\\(.*\\)"))
146-
{
147-
return; // temp until cfi implemented
148-
}
149144

150145
if ("file".equals(url.scheme()))
151146
{

src/main/java/org/w3c/epubcheck/constants/MIMEType.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,6 @@ public boolean is(String string)
5454

5555
public static MIMEType get(String name)
5656
{
57-
return ENUM_MAP.getOrDefault(name.toLowerCase(Locale.ROOT), OTHER);
57+
return (name != null) ? ENUM_MAP.getOrDefault(name.toLowerCase(Locale.ROOT), OTHER) : OTHER;
5858
}
5959
}

0 commit comments

Comments
 (0)