youtubebeat/beater/youtubebeat.go

129 lines
3.0 KiB
Go
Raw Normal View History

2018-11-18 11:08:38 +01:00
package beater
import (
"fmt"
2018-11-18 15:32:28 +01:00
"github.com/gocolly/colly"
"strconv"
"strings"
2018-11-18 11:08:38 +01:00
"time"
"github.com/elastic/beats/libbeat/beat"
"github.com/elastic/beats/libbeat/common"
"github.com/elastic/beats/libbeat/logp"
"github.com/Crocmagnon/youtubebeat/config"
)
// Youtubebeat holds the runtime state of the beat: its stop signal, its
// configuration and its publisher connection.
type Youtubebeat struct {
// done is closed by Stop; Run selects on it to know when to return.
done chan struct{}
// config is the configuration unpacked from the beat's config file in New.
config config.Config
// client is the publisher connection opened in Run and used to send events.
client beat.Client
}
// New builds a Youtubebeat from the raw beat configuration.
//
// The defaults in config.DefaultConfig are used as a starting point and are
// overridden by whatever cfg provides. An error is returned when cfg cannot
// be unpacked into the configuration struct. The publisher client is not
// connected here.
func New(b *beat.Beat, cfg *common.Config) (beat.Beater, error) {
	settings := config.DefaultConfig
	err := cfg.Unpack(&settings)
	if err != nil {
		return nil, fmt.Errorf("Error reading config file: %v", err)
	}

	return &Youtubebeat{
		done:   make(chan struct{}),
		config: settings,
	}, nil
}
2018-11-19 18:21:56 +01:00
// URL fragments used to recognise and rebuild links to video pages.
const (
	// BaseUrl is the scheme and host prepended to relative video links.
	BaseUrl = "https://www.youtube.com"
	// BaseSuffix is the path-and-query prefix that marks a link as a video page.
	BaseSuffix = "/watch?v="
)
2018-11-18 15:32:28 +01:00
func scrapeVideos(b *beat.Beat, bt *Youtubebeat, done chan bool) {
2018-11-18 15:32:28 +01:00
videoCollector := colly.NewCollector(
colly.AllowedDomains("youtube.com", "www.youtube.com"),
colly.Async(true),
colly.MaxDepth(bt.config.MaxDepth),
2018-11-18 15:32:28 +01:00
)
videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: bt.config.Parallelism})
2018-11-20 09:35:31 +01:00
videoCollector.AllowURLRevisit = true
2018-11-18 15:32:28 +01:00
videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
url := e.Request.URL.String()
isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content")
if isPaid == "True" {
logp.Warn("Not parsing video because of isPaid" + url)
2018-11-18 15:32:28 +01:00
return
}
title := e.ChildAttr("meta[itemprop=\"name\"]", "content")
if title == "YouTube" {
logp.Warn("Not parsing video because of title " + url)
2018-11-18 15:32:28 +01:00
return
}
views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64)
if err != nil {
logp.Warn("Can't parse view count for URL " + url)
2018-11-18 15:32:28 +01:00
return
}
date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content")
genre := e.ChildAttr("meta[itemprop=\"genre\"]", "content")
2018-11-18 15:32:28 +01:00
fields := common.MapStr{
"url": url,
"title": title,
"views": views,
"date": date,
"genre": genre,
"type": b.Info.Name,
2018-11-18 15:32:28 +01:00
}
event := beat.Event{
Timestamp: time.Now(),
Fields: fields,
}
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
event.SetID(id)
bt.client.Publish(event)
logp.Info("Event sent")
2018-11-18 15:32:28 +01:00
})
videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
href := e.Attr("href")
if strings.HasPrefix(href, BaseSuffix) {
e.Request.Visit(BaseUrl + href)
}
})
2018-11-20 14:48:22 +01:00
videoCollector.Visit(bt.config.StartUrl)
2018-11-18 15:32:28 +01:00
videoCollector.Wait()
2018-11-20 10:29:28 +01:00
logp.Info("Done parsing all videos")
done <- true
2018-11-18 15:32:28 +01:00
}
2018-11-18 11:08:38 +01:00
// Run starts youtubebeat.
func (bt *Youtubebeat) Run(b *beat.Beat) error {
logp.Info("youtubebeat is running! Hit CTRL-C to stop it.")
var err error
bt.client, err = b.Publisher.Connect()
if err != nil {
return err
}
done := make(chan bool)
go scrapeVideos(b, bt, done)
select {
case <-done:
2018-11-20 10:29:28 +01:00
logp.Info("Exiting on done parsing")
return nil
case <-bt.done:
2018-11-20 10:29:28 +01:00
logp.Info("Exiting on beat stop signal")
return nil
2018-11-18 11:08:38 +01:00
}
}
// Stop stops youtubebeat.
//
// It closes the publisher client, if one has been connected, and then closes
// bt.done so that Run returns. The client is nil until Run has connected it,
// so it is checked first to keep an early Stop from panicking. A failure to
// close the client is logged and does not prevent the stop signal.
func (bt *Youtubebeat) Stop() {
	if bt.client != nil {
		if err := bt.client.Close(); err != nil {
			logp.Warn("Error closing publisher client: %v", err)
		}
	}
	close(bt.done)
}