Extract parallelism and max depth to config

This commit is contained in:
Gabriel Augendre 2018-11-20 10:21:24 +01:00
parent 0ea086dc43
commit e8eb418e0b
5 changed files with 38 additions and 42 deletions

View File

@ -6,6 +6,8 @@ youtubebeat:
# Defines how often an event is sent to the output # Defines how often an event is sent to the output
period: 1s period: 1s
start_id: "SmBCZgcGlKk" start_id: "SmBCZgcGlKk"
parallelism: 5
max_depth: 10
output.elasticsearch.index: "youtubebeat" output.elasticsearch.index: "youtubebeat"

View File

@ -38,30 +38,30 @@ func New(b *beat.Beat, cfg *common.Config) (beat.Beater, error) {
const BaseUrl = "https://www.youtube.com" const BaseUrl = "https://www.youtube.com"
const BaseSuffix = "/watch?v=" const BaseSuffix = "/watch?v="
func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan bool) { func scrapeVideos(b *beat.Beat, bt *Youtubebeat, done chan bool) {
videoCollector := colly.NewCollector( videoCollector := colly.NewCollector(
colly.AllowedDomains("youtube.com", "www.youtube.com"), colly.AllowedDomains("youtube.com", "www.youtube.com"),
colly.Async(true), colly.Async(true),
colly.MaxDepth(10), colly.MaxDepth(bt.config.MaxDepth),
) )
videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 5}) videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: bt.config.Parallelism})
videoCollector.AllowURLRevisit = true videoCollector.AllowURLRevisit = true
videoCollector.OnHTML("body", func(e *colly.HTMLElement) { videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
url := e.Request.URL.String() url := e.Request.URL.String()
isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content") isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content")
if isPaid == "True" { if isPaid == "True" {
logp.Err("Not parsing video because of isPaid" + url) logp.Warn("Not parsing video because of isPaid" + url)
return return
} }
title := e.ChildAttr("meta[itemprop=\"name\"]", "content") title := e.ChildAttr("meta[itemprop=\"name\"]", "content")
if title == "YouTube" { if title == "YouTube" {
logp.Err("Not parsing video because of title " + url) logp.Warn("Not parsing video because of title " + url)
return return
} }
views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64) views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64)
if err != nil { if err != nil {
logp.Err("Can't parse view count for URL " + url) logp.Warn("Can't parse view count for URL " + url)
return return
} }
date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content") date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content")
@ -70,8 +70,18 @@ func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan boo
"title": title, "title": title,
"views": views, "views": views,
"date": date, "date": date,
"type": b.Info.Name,
} }
fieldsToSend <- fields event := beat.Event{
Timestamp: time.Now(),
Fields: fields,
}
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
event.SetID(id)
bt.client.Publish(event)
logp.Info("Event sent")
}) })
videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) { videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
@ -81,7 +91,7 @@ func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan boo
} }
}) })
videoCollector.Visit(BaseUrl + BaseSuffix + startId) videoCollector.Visit(BaseUrl + BaseSuffix + bt.config.StartId)
videoCollector.Wait() videoCollector.Wait()
done <- true done <- true
} }
@ -96,38 +106,14 @@ func (bt *Youtubebeat) Run(b *beat.Beat) error {
return err return err
} }
fieldsToSend := make(chan common.MapStr)
done := make(chan bool) done := make(chan bool)
go scrapeVideos(bt.config.StartId, fieldsToSend, done) go scrapeVideos(b, bt, done)
ticker := time.NewTicker(bt.config.Period) select {
for { case <-done:
select { return nil
case <-done: case <-bt.done:
return nil return nil
case <-bt.done:
return nil
case <-ticker.C:
}
// Handle a SIGINT even when no more videos to fetch
select {
case <-done:
return nil
case <-bt.done:
return nil
case fields := <-fieldsToSend:
fields["type"] = b.Info.Name
event := beat.Event{
Timestamp: time.Now(),
Fields: fields,
}
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
event.SetID(id)
bt.client.Publish(event)
logp.Info("Event sent")
}
} }
} }

View File

@ -6,11 +6,15 @@ package config
import "time" import "time"
type Config struct { type Config struct {
Period time.Duration `config:"period"` Period time.Duration `config:"period"`
StartId string `config:"start_id"` StartId string `config:"start_id"`
Parallelism int `config:"parallelism"`
MaxDepth int `config:"max_depth"`
} }
var DefaultConfig = Config{ var DefaultConfig = Config{
Period: 1 * time.Second, Period: 1 * time.Second,
StartId: "SmBCZgcGlKk", StartId: "SmBCZgcGlKk",
Parallelism: 5,
MaxDepth: 10,
} }

View File

@ -6,6 +6,8 @@ youtubebeat:
# Defines how often an event is sent to the output # Defines how often an event is sent to the output
period: 1s period: 1s
start_id: "SmBCZgcGlKk" start_id: "SmBCZgcGlKk"
parallelism: 5
max_depth: 10
output.elasticsearch.index: "youtubebeat" output.elasticsearch.index: "youtubebeat"

View File

@ -6,6 +6,8 @@ youtubebeat:
# Defines how often an event is sent to the output # Defines how often an event is sent to the output
period: 1s period: 1s
start_id: "SmBCZgcGlKk" start_id: "SmBCZgcGlKk"
parallelism: 5
max_depth: 10
output.elasticsearch.index: "youtubebeat" output.elasticsearch.index: "youtubebeat"