diff --git a/parsers/web/parser.go b/parsers/web/parser.go
index f55d221..5e2d81c 100644
--- a/parsers/web/parser.go
+++ b/parsers/web/parser.go
@@ -28,6 +28,7 @@ var (
 )
 
 const (
+	runeHash    = '#'
 	noTitleStr  = "(no title)"
 	maxHtmlSize = 8 * 1024
 )
@@ -49,6 +50,15 @@ func (p *Parser) Parse(u *url.URL, referer *url.URL) (result parsers.ParseResult
 		return
 	}
 
+	// Remove hash reference from URL since that's not meant to be in the request.
+	// A value copy is taken first so the caller's URL object is never modified
+	// (note that &(*u) is the same pointer as u, not a copy).
+	if i := strings.IndexRune(u.Path, runeHash); i >= 0 {
+		stripped := *u
+		stripped.Path = u.Path[:i]
+		u = &stripped
+	}
+
 	// Make request
 	req, err := http.NewRequest("GET", u.String(), nil)
 	if err != nil {
diff --git a/parsers/web/parser_test.go b/parsers/web/parser_test.go
index 3788133..dfc84ad 100644
--- a/parsers/web/parser_test.go
+++ b/parsers/web/parser_test.go
@@ -116,3 +116,22 @@ func Test_Parser_Parse_IRCBotScience_Redirect(t *testing.T) {
 	assert.NotNil(t, result.FollowUrl)
 	assert.Equal(t, *originalUrl, *result.FollowUrl)
 }
+
+// Test_Parser_Parse_Hash checks that a '#' inside the URL path does not make
+// Parse fail, and that Parse leaves the caller's URL untouched.
+func Test_Parser_Parse_Hash(t *testing.T) {
+	p := mustNewParser(t)
+	originalUrl := &url.URL{
+		Scheme: "https",
+		Host:   "www.google.com",
+		Path:   "/#invalid",
+	}
+	result := p.Parse(originalUrl, nil)
+
+	t.Logf("Result: %+v", result)
+	assert.False(t, result.Ignored)
+	assert.Nil(t, result.Error)
+	assert.Nil(t, result.UserError)
+	// Parse must operate on a copy of the URL, never on the caller's object.
+	assert.Equal(t, "/#invalid", originalUrl.Path)
+}