package web

import (
	"errors"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"

	"image"
	_ "image/gif"
	_ "image/jpeg"
	_ "image/png"

	"github.com/icedream/irc-medialink/parsers"
	"github.com/icedream/irc-medialink/util/limitedio"
	"github.com/yhat/scrape"
)

var (
	// ErrCorruptedImage is returned as the user-facing error when an image
	// body can not be decoded.
	ErrCorruptedImage = errors.New("Corrupted image.")

	// rxNewlines matches runs of line breaks so scraped titles can be
	// collapsed onto a single line.
	rxNewlines = regexp.MustCompile(`(?:\r?\n)+`)
)

const (
	runeHash    = '#'
	noTitleStr  = "(no title)"
	maxHtmlSize = 8 * 1024
)
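
// Parser is the generic web link parser: it fetches HTTP(S) URLs and extracts
// the page title from HTML documents or basic metadata from supported images.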
type Parser struct{}
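
// Init does nothing; the web parser needs no initialization.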
func (p *Parser) Init() error {
	return nil
}
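
// Name returns the human-readable name of this parser.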
func (p *Parser) Name() string {
	return "Web"
}
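
// Parse fetches the given URL and fills the parse result with either the
// HTML page title or, for PNG/JPEG/GIF responses, the image dimensions,
// type and (if known) byte size. Redirects are reported via FollowUrl.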
func (p *Parser) Parse(u *url.URL, referer *url.URL) (result parsers.ParseResult) {
	// Ignore non-HTTP(S) links
	if !strings.EqualFold(u.Scheme, "http") && !strings.EqualFold(u.Scheme, "https") {
		result.Ignored = true
		return
	}

	// Remove hash reference from URL since that's not meant to be in the request
	if strings.Contains(u.Path, string(runeHash)) {
		// Work on a copy so the caller's URL object is not modified
		// (&(*u) would alias the original pointer instead of copying it).
		urlCopy := *u
		u = &urlCopy
		u.Path = u.Path[0:strings.IndexRune(u.Path, runeHash)]
	}

	// Make request
	req, err := http.NewRequest("GET", u.String(), nil)
	if err != nil {
		result.Error = err
		return
	}
	if referer != nil {
		req.Header.Set("Referer", referer.String())
	}
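
	// Go through the transport directly instead of an http.Client so that
	// redirects are not followed automatically and can be reported back to
	// the caller via result.FollowUrl instead.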
	if resp, err := http.DefaultTransport.RoundTrip(req); err != nil {
		result.Error = err
		return
	} else {
		defer resp.Body.Close()
		if 300 <= resp.StatusCode && resp.StatusCode < 400 {
			if u2, err := resp.Location(); err == nil && u2 != nil && *u2 != *u {
				result.FollowUrl = u2
				return
			}
		}
		if resp.StatusCode >= 400 {
			result.UserError = errors.New(resp.Status)
			return
		}
		if resp.StatusCode != 200 {
			result.Ignored = true
			return
		}

		contentType := resp.Header.Get("content-type")
		sep := strings.IndexRune(contentType, ';')
		if sep < 0 {
			sep = len(contentType)
		}
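
		// Decide how to handle the response based on its media type
		// (anything after ";", such as a charset parameter, was stripped above).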
		switch strings.ToLower(contentType[0:sep]) {
		case "text/html":
			// Parse the page
			var contentLength int
			if resp.ContentLength < 0 || resp.ContentLength > maxHtmlSize {
				contentLength = maxHtmlSize
			} else {
				contentLength = int(resp.ContentLength)
			}
			limitedBody := limitedio.NewLimitedReader(resp.Body, contentLength)
			root, err := html.Parse(limitedBody)
			if err != nil {
				result.Error = err
				return
			}

			// Search for the title
			result.Information = []map[string]interface{}{
				{
					"IsUpload": false,
				},
			}
			title, ok := scrape.Find(root, scrape.ByTag(atom.Title))
			if ok {
				// Got it!
				result.Information[0]["Title"] = rxNewlines.ReplaceAllString(scrape.Text(title), " ")
			} else {
				// No title found
				result.Information[0]["Title"] = noTitleStr
			}

		case "image/png", "image/jpeg", "image/gif":
			// No need to limit the reader to a specific size here as
			// image.DecodeConfig only reads as much as needed anyways.
			if m, imgType, err := image.DecodeConfig(resp.Body); err != nil {
				result.UserError = ErrCorruptedImage
			} else {
				info := map[string]interface{}{
					"IsUpload":  true,
					"ImageSize": image.Point{X: m.Width, Y: m.Height},
					"ImageType": strings.ToUpper(imgType),
				}
				if resp.ContentLength > 0 {
					info["Size"] = uint64(resp.ContentLength)
				}
				result.Information = []map[string]interface{}{info}
				log.Printf("Got through: %+v!", info)
			}

		default:
			// TODO - Implement generic head info?
			result.Ignored = true
		}
	}

	return
}