parsers/web: Remove hash reference when parsing URL.

Fixes #8.
develop
Icedream 2016-06-20 02:43:30 +02:00
parent 280da493fb
commit dc5597c054
2 changed files with 22 additions and 0 deletions

View File

@ -28,6 +28,7 @@ var (
)
const (
// runeHash is the character that starts the hash reference of a URL.
runeHash = '#'
// noTitleStr is a "(no title)" placeholder string; presumably shown when a
// page has no usable title (usage is outside this view, confirm).
noTitleStr = "(no title)"
// maxHtmlSize is 8 KiB (8 * 1024); presumably a byte limit on how much HTML
// is read from a response (usage is outside this view, confirm).
maxHtmlSize = 8 * 1024
)
@ -49,6 +50,12 @@ func (p *Parser) Parse(u *url.URL, referer *url.URL) (result parsers.ParseResult
return
}
// Remove the hash reference from the URL since that is not meant to be part
// of the request.
if hashIdx := strings.IndexRune(u.Path, runeHash); hashIdx >= 0 {
	// Work on a shallow copy so the caller's URL object is left untouched.
	// Note that &(*u) evaluates to u itself and therefore copies nothing;
	// the struct has to be copied by value first.
	stripped := *u
	stripped.Path = u.Path[:hashIdx]
	u = &stripped
}
// Make request
req, err := http.NewRequest("GET", u.String(), nil)
if err != nil {

View File

@ -116,3 +116,18 @@ func Test_Parser_Parse_IRCBotScience_Redirect(t *testing.T) {
assert.NotNil(t, result.FollowUrl)
assert.Equal(t, *originalUrl, *result.FollowUrl)
}
// Test_Parser_Parse_Hash feeds the parser a URL whose path contains a hash
// reference and expects a result that is neither ignored nor an error.
func Test_Parser_Parse_Hash(t *testing.T) {
	// URL under test: https://www.google.com with "/#invalid" as its path.
	hashed := new(url.URL)
	hashed.Scheme = "https"
	hashed.Host = "www.google.com"
	hashed.Path = "/#invalid"

	parser := mustNewParser(t)
	outcome := parser.Parse(hashed, nil)
	t.Logf("Result: %+v", outcome)

	assert.False(t, outcome.Ignored)
	assert.Nil(t, outcome.Error)
	assert.Nil(t, outcome.UserError)
}