// File: youtubebeat/beater/youtubebeat.go (Go, 131 lines, 2.9 KiB)
package beater
import (
"fmt"
2018-11-18 15:32:28 +01:00
"github.com/gocolly/colly"
"strconv"
"strings"
2018-11-18 11:08:38 +01:00
"time"
"github.com/elastic/beats/libbeat/beat"
"github.com/elastic/beats/libbeat/common"
"github.com/elastic/beats/libbeat/logp"
"github.com/Crocmagnon/youtubebeat/config"
)
// Youtubebeat is the beater: it holds the unpacked configuration, the
// publisher client used to send events, and the channel used to signal
// shutdown.
type Youtubebeat struct {
	done   chan struct{} // closed by Stop; Run returns once it is closed
	config config.Config // settings unpacked from the beat configuration in New
	client beat.Client   // publisher client, assigned in Run via b.Publisher.Connect
}
// New builds a Youtubebeat from the raw beat configuration. It starts from
// config.DefaultConfig, overlays whatever cfg provides, and returns an error
// if cfg cannot be unpacked.
func New(b *beat.Beat, cfg *common.Config) (beat.Beater, error) {
	settings := config.DefaultConfig
	err := cfg.Unpack(&settings)
	if err != nil {
		return nil, fmt.Errorf("Error reading config file: %v", err)
	}

	return &Youtubebeat{
		config: settings,
		done:   make(chan struct{}),
	}, nil
}
func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan bool) {
2018-11-18 15:32:28 +01:00
const BaseUrl = "https://www.youtube.com"
const BaseSuffix = "/watch?v="
videoCollector := colly.NewCollector(
colly.AllowedDomains("youtube.com", "www.youtube.com"),
colly.Async(true),
colly.MaxDepth(10),
)
videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
url := e.Request.URL.String()
isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content")
if isPaid == "True" {
logp.Err("Not parsing video because of isPaid" + url)
return
}
title := e.ChildAttr("meta[itemprop=\"name\"]", "content")
if title == "YouTube" {
logp.Err("Not parsing video because of title " + url)
return
}
views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64)
if err != nil {
logp.Err("Can't parse view count for URL " + url)
return
}
date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content")
fields := common.MapStr{
"url": url,
"title": title,
"views": views,
"date": date,
}
fieldsToSend <- fields
})
videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
href := e.Attr("href")
if strings.HasPrefix(href, BaseSuffix) {
e.Request.Visit(BaseUrl + href)
}
})
videoCollector.Visit(BaseUrl + BaseSuffix + startId)
videoCollector.Wait()
done <- true
2018-11-18 15:32:28 +01:00
}
2018-11-18 11:08:38 +01:00
// Run starts youtubebeat.
func (bt *Youtubebeat) Run(b *beat.Beat) error {
logp.Info("youtubebeat is running! Hit CTRL-C to stop it.")
var err error
bt.client, err = b.Publisher.Connect()
if err != nil {
return err
}
2018-11-18 15:32:28 +01:00
fieldsToSend := make(chan common.MapStr)
done := make(chan bool)
go scrapeVideos(bt.config.StartId, fieldsToSend, done)
2018-11-18 15:32:28 +01:00
2018-11-18 11:08:38 +01:00
ticker := time.NewTicker(bt.config.Period)
for {
select {
case <-bt.done:
return nil
case <-ticker.C:
}
// Handle a SIGINT even when no more videos to fetch
select {
case <-done:
return nil
case <-bt.done:
return nil
case fields := <-fieldsToSend:
fields["type"] = b.Info.Name
event := beat.Event{
Timestamp: time.Now(),
Fields: fields,
}
bt.client.Publish(event)
logp.Info("Event sent")
2018-11-18 11:08:38 +01:00
}
}
}
// Stop stops youtubebeat. It closes the publisher client, if Run has
// connected one, and then closes the done channel so that Run returns.
func (bt *Youtubebeat) Stop() {
	// client is only assigned inside Run; calling Close on a nil interface
	// would panic if Stop arrives before the connection was established.
	if bt.client != nil {
		if err := bt.client.Close(); err != nil {
			logp.Err("Error closing publisher client: %v", err)
		}
	}
	close(bt.done)
}