parsers/web: Limit HTML parsing to the first 8 KiB of the response body, or to the Content-Length value when it is smaller.

Targets #2.
develop
Icedream 2016-06-19 23:05:51 +02:00
parent be2edc845a
commit 6775fe5100
2 changed files with 72 additions and 8 deletions

View File

@ -16,6 +16,7 @@ import (
_ "image/png" _ "image/png"
"github.com/icedream/irc-medialink/parsers" "github.com/icedream/irc-medialink/parsers"
"github.com/icedream/irc-medialink/util/limitedio"
"github.com/yhat/scrape" "github.com/yhat/scrape"
) )
@ -23,6 +24,10 @@ var (
ErrCorruptedImage = errors.New("Corrupted image.") ErrCorruptedImage = errors.New("Corrupted image.")
) )
const (
// maxHtmlSize is the upper bound, in bytes (8 KiB), on how much of an HTML
// response body is passed to the HTML parser when looking for the title.
maxHtmlSize = 8 * 1024
)
type Parser struct{} type Parser struct{}
func (p *Parser) Init() error { func (p *Parser) Init() error {
@ -79,26 +84,37 @@ func (p *Parser) Parse(u *url.URL, referer *url.URL) (result parsers.ParseResult
switch strings.ToLower(contentType[0:sep]) { switch strings.ToLower(contentType[0:sep]) {
case "text/html": case "text/html":
// Parse the page // Parse the page
root, err := html.Parse(resp.Body) var contentLength int
if resp.ContentLength < 0 || resp.ContentLength > maxHtmlSize {
contentLength = maxHtmlSize
} else {
contentLength = int(resp.ContentLength)
}
limitedBody := limitedio.NewLimitedReader(resp.Body, contentLength)
root, err := html.Parse(limitedBody)
if err != nil { if err != nil {
result.Error = err result.Error = err
return return
} }
// Search for the title // Search for the title
result.Information = []map[string]interface{}{
map[string]interface{}{
"IsUpload": false,
},
}
title, ok := scrape.Find(root, scrape.ByTag(atom.Title)) title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
if ok { if ok {
// Got it! // Got it!
result.Information = []map[string]interface{}{ result.Information[0]["Title"] = scrape.Text(title)
map[string]interface{}{
"IsUpload": false,
"Title": scrape.Text(title),
},
}
} else { } else {
result.Ignored = true // No title found
result.Information[0]["Title"] = "(no title)"
} }
case "image/png", "image/jpeg", "image/gif": case "image/png", "image/jpeg", "image/gif":
log.Print("Parsing image...") log.Print("Parsing image...")
// No need to limit the reader to a specific size here as
// image.DecodeConfig only reads as much as needed anyways.
if m, imgType, err := image.DecodeConfig(resp.Body); err != nil { if m, imgType, err := image.DecodeConfig(resp.Body); err != nil {
result.UserError = ErrCorruptedImage result.UserError = ErrCorruptedImage
} else { } else {

View File

@ -0,0 +1,48 @@
package limitedio
import "io"
// limitedReader wraps an io.Reader and stops yielding data once a fixed
// number of bytes has been read through it.
type limitedReader struct {
	// Reader is the underlying source that data is pulled from.
	io.Reader
	// rest is the number of bytes that may still be read; once it drops
	// to zero or below, every Read reports io.EOF.
	rest int
}

// NewLimitedReader returns a reader that reads from r but reports io.EOF
// once limit bytes have been read through it. A limit of zero or less
// produces a reader that is at EOF from the start.
func NewLimitedReader(r io.Reader, limit int) io.Reader {
	return &limitedReader{r, limit}
}

// Read reads at most min(len(data), remaining budget) bytes from the
// underlying reader into data and deducts the number of bytes actually read
// from the budget. It returns 0, io.EOF once the budget is used up; any
// other error comes straight from the underlying reader.
func (r *limitedReader) Read(data []byte) (n int, err error) {
	if r.rest <= 0 {
		return 0, io.EOF
	}

	// Shrink the caller's buffer to the remaining budget and read into it
	// directly. This avoids a scratch allocation and an extra copy on every
	// call, and leaves the bytes past n untouched by this wrapper.
	if len(data) > r.rest {
		data = data[:r.rest]
	}

	n, err = r.Reader.Read(data)
	r.rest -= n
	return n, err
}
// limitedReadCloser is a limitedReader that also exposes the Close method of
// the stream it wraps, so the limited view can be closed like the original.
type limitedReadCloser struct {
	*limitedReader
	// closeMethod is the Close method of the wrapped io.ReadCloser.
	closeMethod func() error
}

// NewLimitedReadCloser returns a reader that reads from r but reports io.EOF
// once limit bytes have been read through it, and whose Close closes r.
//
// The result is declared as io.ReadCloser so that callers can reach Close
// without a type assertion; it remains assignable to io.Reader.
func NewLimitedReadCloser(r io.ReadCloser, limit int) io.ReadCloser {
	return &limitedReadCloser{&limitedReader{r, limit}, r.Close}
}

// Close closes the wrapped stream and returns its error, if any.
func (rc *limitedReadCloser) Close() error {
	return rc.closeMethod()
}