-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Closed
Labels
Description
Please answer these questions before submitting your issue. Thanks!
What version of Go are you using (`go version`)?
go version go1.9.2 darwin/amd64
Does this issue reproduce with the latest release?
YES
What operating system and processor architecture are you using (`go env`)?
darwin/amd64
What did you do?
I used goquery to crawl web pages.
If possible, provide a recipe for reproducing the error.
A complete runnable program is good.
A link on play.golang.org is best.
// Reproduction snippet. NOTE: the URL literal below begins with a space
// character (before "https").
url := " https://zhidao.baidu.com/special/view?id=49105a24626975510000&preview=1"
// client is not defined in this snippet — presumably an HTTP client from a
// third-party library; confirm against the full program.
resp, _, rerr := client.Get(url).Retry(2, 1*time.Second).End()
if rerr != nil {
// Print the URL, the text between its first and second ':', and the index
// of the first ':' to help diagnose the failure.
fmt.Println("->", url)
fmt.Println(strings.Split(url, ":")[1])
fmt.Println(strings.Index(url, ":"))
log.Println(rerr)
}
What did you expect to see?
No error should occur.
What did you see instead?
[parse http://zhidao.baidu.com/special/view?id=a9105a24626975510000&preview=1: first path segment in URL cannot contain colon]
Suggested fix:
// net/url
// getscheme splits rawurl at the first ':' when everything before it forms
// a syntactically valid scheme, i.e. matches [a-zA-Z][a-zA-Z0-9+-.]*.
//
// Results:
//   - valid scheme found: (scheme, text after the colon, nil)
//   - rawurl starts with ':': ("", "", error "missing protocol scheme")
//   - anything else (a non-scheme byte appears before any colon, the first
//     byte is a digit/'+'/'-'/'.', or there is no colon): ("", rawurl, nil)
func getscheme(rawurl string) (scheme, path string, err error) {
	for pos, ch := range []byte(rawurl) {
		// Letters are legal at every position of a scheme.
		if ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') {
			continue
		}

		// Digits and "+-." are legal everywhere except the first position.
		if ('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == '.' {
			if pos == 0 {
				return "", rawurl, nil
			}
			continue
		}

		// A colon terminates the scheme; it must be preceded by something.
		if ch == ':' {
			if pos == 0 {
				return "", "", errors.New("missing protocol scheme")
			}
			return rawurl[:pos], rawurl[pos+1:], nil
		}

		// Any other byte means the prefix cannot be a scheme.
		return "", rawurl, nil
	}

	// Ran off the end without seeing a colon: no scheme present.
	return "", rawurl, nil
}
If the leading character is a space, skip it and begin checking the scheme at the first non-space character. Alternatively, simply call TrimSpace before invoking the getscheme method.
// parse turns rawurl into a *URL.
//
// When viaRequest is true the string is treated as having arrived in an
// HTTP request: an empty string is rejected, and a scheme-less string must
// begin with "/". When viaRequest is false every form of relative
// reference is accepted, except one whose first path segment contains ':'.
//
// The literal "*" yields a URL whose Path is "*". A string with a scheme
// whose remainder does not start with "/" is stored in Opaque.
func parse(rawurl string, viaRequest bool) (*URL, error) {
	if viaRequest && rawurl == "" {
		return nil, errors.New("empty url")
	}

	u := &URL{}
	if rawurl == "*" {
		u.Path = "*"
		return u, nil
	}

	// Peel off a leading "http:", "mailto:", etc. The scheme cannot
	// contain escaped characters.
	scheme, rest, err := getscheme(rawurl)
	if err != nil {
		return nil, err
	}
	u.Scheme = strings.ToLower(scheme)

	// A lone trailing '?' means "empty query, but keep the question mark".
	if strings.Count(rest, "?") == 1 && strings.HasSuffix(rest, "?") {
		u.ForceQuery = true
		rest = strings.TrimSuffix(rest, "?")
	} else {
		rest, u.RawQuery = split(rest, "?", true)
	}

	if !strings.HasPrefix(rest, "/") {
		switch {
		case u.Scheme != "":
			// Rootless paths are treated as opaque, per RFC 3986.
			u.Opaque = rest
			return u, nil
		case viaRequest:
			return nil, errors.New("invalid URI for request")
		}

		// Guard against malformed schemes such as cache_object:foo/bar
		// (golang.org/issue/16822). RFC 3986, §3.3: the first path segment
		// of a relative-path reference cannot contain a colon.
		firstSegment := rest
		if slash := strings.Index(rest, "/"); slash >= 0 {
			firstSegment = rest[:slash]
		}
		if strings.Contains(firstSegment, ":") {
			return nil, errors.New("first path segment in URL cannot contain colon")
		}
	}

	// An authority component follows "//". Without a scheme it is only
	// recognised outside the request context, and never for a "///" prefix.
	if strings.HasPrefix(rest, "//") {
		if u.Scheme != "" || (!viaRequest && !strings.HasPrefix(rest, "///")) {
			authority, remainder := split(rest[2:], "/", false)
			rest = remainder
			u.User, u.Host, err = parseAuthority(authority)
			if err != nil {
				return nil, err
			}
		}
	}

	// Whatever is left is the path; setPath stores it on u and reports an
	// error if it cannot.
	if err = u.setPath(rest); err != nil {
		return nil, err
	}
	return u, nil
}
Just add strings.TrimSpace(rawurl) at line 477 of the net/url source file.