From dc5597c054a38bac5625d6c5c0e39ab003a3aae5 Mon Sep 17 00:00:00 2001
From: Carl Kittelberger
Date: Mon, 20 Jun 2016 02:43:30 +0200
Subject: [PATCH] parsers/web: Remove hash reference when parsing URL. Fixes #8.

---
Review note: the original hunk used `u = &(*u)` to "copy" the URL, which
yields the same pointer and therefore modified the caller's URL object. It is
replaced by a real shallow copy, and the test now asserts that the caller's
URL is left untouched. The blob ids on the "index" lines still refer to the
original revision of this patch.

 parsers/web/parser.go      | 10 ++++++++++
 parsers/web/parser_test.go | 17 +++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/parsers/web/parser.go b/parsers/web/parser.go
index f55d221..5e2d81c 100644
--- a/parsers/web/parser.go
+++ b/parsers/web/parser.go
@@ -28,6 +28,7 @@ var (
 )
 
 const (
+	runeHash    = '#'
 	noTitleStr  = "(no title)"
 	maxHtmlSize = 8 * 1024
 )
@@ -49,6 +50,15 @@ func (p *Parser) Parse(u *url.URL, referer *url.URL) (result parsers.ParseResult
 		return
 	}
 
+	// Remove hash reference from URL since that's not meant to be in the request.
+	// The edit is applied to a shallow copy so that the caller's URL object is
+	// never modified; only the local variable u is rebound to the copy.
+	if hashIdx := strings.IndexRune(u.Path, runeHash); hashIdx >= 0 {
+		stripped := *u
+		stripped.Path = u.Path[:hashIdx]
+		u = &stripped
+	}
+
 	// Make request
 	req, err := http.NewRequest("GET", u.String(), nil)
 	if err != nil {
diff --git a/parsers/web/parser_test.go b/parsers/web/parser_test.go
index 3788133..dfc84ad 100644
--- a/parsers/web/parser_test.go
+++ b/parsers/web/parser_test.go
@@ -116,3 +116,20 @@ func Test_Parser_Parse_IRCBotScience_Redirect(t *testing.T) {
 	assert.NotNil(t, result.FollowUrl)
 	assert.Equal(t, *originalUrl, *result.FollowUrl)
 }
+
+func Test_Parser_Parse_Hash(t *testing.T) {
+	p := mustNewParser(t)
+	originalUrl := &url.URL{
+		Scheme: "https",
+		Host:   "www.google.com",
+		Path:   "/#invalid",
+	}
+	result := p.Parse(originalUrl, nil)
+
+	t.Logf("Result: %+v", result)
+	assert.False(t, result.Ignored)
+	assert.Nil(t, result.Error)
+	assert.Nil(t, result.UserError)
+	// Parse must not modify the URL object handed in by the caller
+	assert.Equal(t, "/#invalid", originalUrl.Path)
+}