Skip to content

net/url: parse url error with leading space in url #24246

@smileboywtu

Description

@smileboywtu

Please answer these questions before submitting your issue. Thanks!

What version of Go are you using (go version)?

go version go1.9.2 darwin/amd64

Does this issue reproduce with the latest release?

YES

What operating system and processor architecture are you using (go env)?

darwin/amd64

What did you do?

use goquery to crawl web pages

If possible, provide a recipe for reproducing the error.
A complete runnable program is good.
A link on play.golang.org is best.

url := " https://zhidao.baidu.com/special/view?id=49105a24626975510000&preview=1"
resp, _, rerr := client.Get(url).Retry(2, 1*time.Second).End()
if rerr != nil {
    fmt.Println("->", url)
    fmt.Println(strings.Split(url, ":")[1])
    fmt.Println(strings.Index(url, ":"))
    log.Println(rerr)
}

What did you expect to see?

no error happens

What did you see instead?

[parse http://zhidao.baidu.com/special/view?id=a9105a24626975510000&preview=1: first path segment in URL cannot contain colon]

fix recommends

// net/url
// Maybe rawurl is of the form scheme:path.
// (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*)
// If so, return scheme, path; else return "", rawurl.
func getscheme(rawurl string) (scheme, path string, err error) {
	for i := 0; i < len(rawurl); i++ {
		c := rawurl[i]
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
		// do nothing
		case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.':
			if i == 0 {
				return "", rawurl, nil
			}
		case c == ':':
			if i == 0 {
				return "", "", errors.New("missing protocol scheme")
			}
			return rawurl[:i], rawurl[i+1:], nil
		default:
			// we have encountered an invalid character,
			// so there is no valid scheme
			return "", rawurl, nil
		}
	}
	return "", rawurl, nil
}

if leading character is space character, then continue, start to check when first character is not space.

or you can just use TrimSpace before invoke the getscheme method.

// parse parses a URL from a string in one of two contexts. If
// viaRequest is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed.
// If viaRequest is false, all forms of relative URLs are allowed.
func parse(rawurl string, viaRequest bool) (*URL, error) {
	var rest string
	var err error

	if rawurl == "" && viaRequest {
		return nil, errors.New("empty url")
	}
	url := new(URL)

	if rawurl == "*" {
		url.Path = "*"
		return url, nil
	}

	// Split off possible leading "http:", "mailto:", etc.
	// Cannot contain escaped characters.
	if url.Scheme, rest, err = getscheme(rawurl); err != nil {
		return nil, err
	}
	url.Scheme = strings.ToLower(url.Scheme)

	if strings.HasSuffix(rest, "?") && strings.Count(rest, "?") == 1 {
		url.ForceQuery = true
		rest = rest[:len(rest)-1]
	} else {
		rest, url.RawQuery = split(rest, "?", true)
	}

	if !strings.HasPrefix(rest, "/") {
		if url.Scheme != "" {
			// We consider rootless paths per RFC 3986 as opaque.
			url.Opaque = rest
			return url, nil
		}
		if viaRequest {
			return nil, errors.New("invalid URI for request")
		}

		// Avoid confusion with malformed schemes, like cache_object:foo/bar.
		// See golang.org/issue/16822.
		//
		// RFC 3986, §3.3:
		// In addition, a URI reference (Section 4.1) may be a relative-path reference,
		// in which case the first path segment cannot contain a colon (":") character.
		colon := strings.Index(rest, ":")
		slash := strings.Index(rest, "/")
		if colon >= 0 && (slash < 0 || colon < slash) {
			// First path segment has colon. Not allowed in relative URL.
			return nil, errors.New("first path segment in URL cannot contain colon")
		}
	}

	if (url.Scheme != "" || !viaRequest && !strings.HasPrefix(rest, "///")) && strings.HasPrefix(rest, "//") {
		var authority string
		authority, rest = split(rest[2:], "/", false)
		url.User, url.Host, err = parseAuthority(authority)
		if err != nil {
			return nil, err
		}
	}
	// Set Path and, optionally, RawPath.
	// RawPath is a hint of the encoding of Path. We don't want to set it if
	// the default escaping of Path is equivalent, to help make sure that people
	// don't rely on it in general.
	if err := url.setPath(rest); err != nil {
		return nil, err
	}
	return url, nil
}

just add strings.TrimSpace(rawurl) to 477 line inside net/url source file.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions