2018-11-18 11:08:38 +01:00
|
|
|
package beater
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2018-11-18 15:32:28 +01:00
|
|
|
"github.com/gocolly/colly"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
2018-11-18 11:08:38 +01:00
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/elastic/beats/libbeat/beat"
|
|
|
|
"github.com/elastic/beats/libbeat/common"
|
|
|
|
"github.com/elastic/beats/libbeat/logp"
|
|
|
|
|
|
|
|
"github.com/Crocmagnon/youtubebeat/config"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Youtubebeat configuration.
|
|
|
|
type Youtubebeat struct {
|
|
|
|
done chan struct{}
|
|
|
|
config config.Config
|
|
|
|
client beat.Client
|
|
|
|
}
|
|
|
|
|
|
|
|
// New creates an instance of youtubebeat.
|
|
|
|
func New(b *beat.Beat, cfg *common.Config) (beat.Beater, error) {
|
|
|
|
c := config.DefaultConfig
|
|
|
|
if err := cfg.Unpack(&c); err != nil {
|
|
|
|
return nil, fmt.Errorf("Error reading config file: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
bt := &Youtubebeat{
|
|
|
|
done: make(chan struct{}),
|
|
|
|
config: c,
|
|
|
|
}
|
|
|
|
return bt, nil
|
|
|
|
}
|
|
|
|
|
2018-11-19 18:21:56 +01:00
|
|
|
// URL pieces used to recognise watch-page links and to rebuild absolute
// addresses from relative hrefs.
const (
	// BaseUrl is the scheme and host prepended to relative watch links.
	BaseUrl = "https://www.youtube.com"
	// BaseSuffix is the path-and-query prefix of a watch page; the video ID
	// comes right after it.
	BaseSuffix = "/watch?v="
)
|
2018-11-18 15:32:28 +01:00
|
|
|
|
2018-11-20 10:21:24 +01:00
|
|
|
func scrapeVideos(b *beat.Beat, bt *Youtubebeat, done chan bool) {
|
2018-11-18 15:32:28 +01:00
|
|
|
videoCollector := colly.NewCollector(
|
|
|
|
colly.AllowedDomains("youtube.com", "www.youtube.com"),
|
|
|
|
colly.Async(true),
|
2018-11-20 10:21:24 +01:00
|
|
|
colly.MaxDepth(bt.config.MaxDepth),
|
2018-11-18 15:32:28 +01:00
|
|
|
)
|
2018-11-20 10:21:24 +01:00
|
|
|
videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: bt.config.Parallelism})
|
2018-11-20 09:35:31 +01:00
|
|
|
videoCollector.AllowURLRevisit = true
|
2018-11-18 15:32:28 +01:00
|
|
|
|
|
|
|
videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
|
|
|
|
url := e.Request.URL.String()
|
|
|
|
isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content")
|
|
|
|
if isPaid == "True" {
|
2018-11-20 10:21:24 +01:00
|
|
|
logp.Warn("Not parsing video because of isPaid" + url)
|
2018-11-18 15:32:28 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
title := e.ChildAttr("meta[itemprop=\"name\"]", "content")
|
|
|
|
if title == "YouTube" {
|
2018-11-20 10:21:24 +01:00
|
|
|
logp.Warn("Not parsing video because of title " + url)
|
2018-11-18 15:32:28 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64)
|
|
|
|
if err != nil {
|
2018-11-20 10:21:24 +01:00
|
|
|
logp.Warn("Can't parse view count for URL " + url)
|
2018-11-18 15:32:28 +01:00
|
|
|
return
|
|
|
|
}
|
|
|
|
date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content")
|
2018-11-20 11:45:41 +01:00
|
|
|
genre := e.ChildAttr("meta[itemprop=\"genre\"]", "content")
|
2018-11-18 15:32:28 +01:00
|
|
|
fields := common.MapStr{
|
|
|
|
"url": url,
|
|
|
|
"title": title,
|
|
|
|
"views": views,
|
|
|
|
"date": date,
|
2018-11-20 11:45:41 +01:00
|
|
|
"genre": genre,
|
2018-11-20 10:21:24 +01:00
|
|
|
"type": b.Info.Name,
|
2018-11-18 15:32:28 +01:00
|
|
|
}
|
2018-11-20 10:21:24 +01:00
|
|
|
event := beat.Event{
|
|
|
|
Timestamp: time.Now(),
|
|
|
|
Fields: fields,
|
|
|
|
}
|
|
|
|
|
|
|
|
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
|
|
|
|
event.SetID(id)
|
|
|
|
bt.client.Publish(event)
|
|
|
|
logp.Info("Event sent")
|
2018-11-18 15:32:28 +01:00
|
|
|
})
|
|
|
|
|
|
|
|
videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
|
|
|
href := e.Attr("href")
|
|
|
|
if strings.HasPrefix(href, BaseSuffix) {
|
|
|
|
e.Request.Visit(BaseUrl + href)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
|
2018-11-20 14:48:22 +01:00
|
|
|
videoCollector.Visit(bt.config.StartUrl)
|
2018-11-18 15:32:28 +01:00
|
|
|
videoCollector.Wait()
|
2018-11-20 10:29:28 +01:00
|
|
|
logp.Info("Done parsing all videos")
|
2018-11-18 18:46:54 +01:00
|
|
|
done <- true
|
2018-11-18 15:32:28 +01:00
|
|
|
}
|
|
|
|
|
2018-11-18 11:08:38 +01:00
|
|
|
// Run starts youtubebeat.
|
|
|
|
func (bt *Youtubebeat) Run(b *beat.Beat) error {
|
|
|
|
logp.Info("youtubebeat is running! Hit CTRL-C to stop it.")
|
|
|
|
|
|
|
|
var err error
|
|
|
|
bt.client, err = b.Publisher.Connect()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-11-18 18:46:54 +01:00
|
|
|
done := make(chan bool)
|
2018-11-20 10:21:24 +01:00
|
|
|
go scrapeVideos(b, bt, done)
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-done:
|
2018-11-20 10:29:28 +01:00
|
|
|
logp.Info("Exiting on done parsing")
|
2018-11-20 10:21:24 +01:00
|
|
|
return nil
|
|
|
|
case <-bt.done:
|
2018-11-20 10:29:28 +01:00
|
|
|
logp.Info("Exiting on beat stop signal")
|
2018-11-20 10:21:24 +01:00
|
|
|
return nil
|
2018-11-18 11:08:38 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Stop stops youtubebeat.
|
|
|
|
func (bt *Youtubebeat) Stop() {
|
|
|
|
bt.client.Close()
|
|
|
|
close(bt.done)
|
|
|
|
}
|