From 6775fe51003d194541b6d7d4bade911e52d71f0c Mon Sep 17 00:00:00 2001
From: Carl Kittelberger
Date: Sun, 19 Jun 2016 23:05:51 +0200
Subject: [PATCH] parsers/web: Limit HTML parsing to first 8 kB and use
 Content-Length header.

Targets #2.
---
 parsers/web/parser.go            | 32 +++++++++++++++------
 util/limitedio/limited_reader.go | 48 ++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 8 deletions(-)
 create mode 100644 util/limitedio/limited_reader.go

diff --git a/parsers/web/parser.go b/parsers/web/parser.go
index 5242323..e2ae317 100644
--- a/parsers/web/parser.go
+++ b/parsers/web/parser.go
@@ -16,6 +16,7 @@ import (
 	_ "image/png"
 
 	"github.com/icedream/irc-medialink/parsers"
+	"github.com/icedream/irc-medialink/util/limitedio"
 
 	"github.com/yhat/scrape"
 )
@@ -23,6 +24,10 @@ var (
 	ErrCorruptedImage = errors.New("Corrupted image.")
 )
 
+const (
+	maxHtmlSize = 8 * 1024
+)
+
 type Parser struct{}
 
 func (p *Parser) Init() error {
@@ -79,26 +84,37 @@ func (p *Parser) Parse(u *url.URL, referer *url.URL) (result parsers.ParseResult
 	switch strings.ToLower(contentType[0:sep]) {
 	case "text/html":
 		// Parse the page
-		root, err := html.Parse(resp.Body)
+		var contentLength int
+		if resp.ContentLength < 0 || resp.ContentLength > maxHtmlSize {
+			contentLength = maxHtmlSize
+		} else {
+			contentLength = int(resp.ContentLength)
+		}
+		limitedBody := limitedio.NewLimitedReader(resp.Body, contentLength)
+		root, err := html.Parse(limitedBody)
 		if err != nil {
 			result.Error = err
 			return
 		}
 		// Search for the title
+		result.Information = []map[string]interface{}{
+			map[string]interface{}{
+				"IsUpload": false,
+			},
+		}
 		title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
 		if ok {
 			// Got it!
-			result.Information = []map[string]interface{}{
-				map[string]interface{}{
-					"IsUpload": false,
-					"Title":    scrape.Text(title),
-				},
-			}
+			result.Information[0]["Title"] = scrape.Text(title)
 		} else {
-			result.Ignored = true
+			// No title found
+			result.Information[0]["Title"] = "(no title)"
 		}
 	case "image/png", "image/jpeg", "image/gif":
 		log.Print("Parsing image...")
+
+		// No need to limit the reader to a specific size here, as
+		// image.DecodeConfig only reads as much as it needs anyway.
 		if m, imgType, err := image.DecodeConfig(resp.Body); err != nil {
 			result.UserError = ErrCorruptedImage
 		} else {
diff --git a/util/limitedio/limited_reader.go b/util/limitedio/limited_reader.go
new file mode 100644
index 0000000..6c0ba39
--- /dev/null
+++ b/util/limitedio/limited_reader.go
@@ -0,0 +1,48 @@
+package limitedio
+
+import "io"
+
+type limitedReader struct {
+	io.Reader
+	rest int
+}
+
+func NewLimitedReader(r io.Reader, limit int) io.Reader {
+	return &limitedReader{r, limit}
+}
+
+func (r *limitedReader) Read(data []byte) (n int, err error) {
+	if r.rest <= 0 {
+		err = io.EOF
+		return
+	}
+
+	var dataSize int
+	if len(data) < r.rest {
+		dataSize = len(data)
+	} else {
+		dataSize = r.rest
+	}
+
+	actualData := make([]byte, dataSize)
+	n, err = r.Reader.Read(actualData)
+	if n > 0 {
+		copy(data, actualData)
+	}
+	r.rest -= n
+
+	return
+}
+
+type limitedReadCloser struct {
+	*limitedReader
+	closeMethod func() error
+}
+
+func NewLimitedReadCloser(r io.ReadCloser, limit int) io.ReadCloser {
+	return &limitedReadCloser{&limitedReader{r, limit}, r.Close}
+}
+
+func (rc *limitedReadCloser) Close() error {
+	return rc.closeMethod()
+}
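
Note for reviewers: a minimal usage sketch of the new limitedio package, not
part of the patch itself. It applies the same 8 kB cap to an arbitrary HTTP
response body that parsers/web now applies before html.Parse; the URL is a
placeholder.

	package main

	import (
		"fmt"
		"io/ioutil"
		"net/http"

		"github.com/icedream/irc-medialink/util/limitedio"
	)

	func main() {
		resp, err := http.Get("http://example.com/")
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		// Wrap the body so at most 8 kB can ever be read from it,
		// mirroring maxHtmlSize in parsers/web. Reads past the cap
		// return io.EOF.
		limited := limitedio.NewLimitedReader(resp.Body, 8*1024)

		data, err := ioutil.ReadAll(limited)
		if err != nil {
			panic(err)
		}
		fmt.Printf("read %d bytes (capped at 8192)\n", len(data))
	}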
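
Design note: for a plain reader, the semantics here match the standard
library's io.LimitReader (EOF once the byte budget is spent), so the HTML
branch could in principle be written without the new package, assuming the
int limit is widened to int64:

	limitedBody := io.LimitReader(resp.Body, int64(contentLength))

What the standard library does not provide is a limited io.ReadCloser; that
is the piece NewLimitedReadCloser adds on top.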