Extract parallelism and max depth to config
This commit is contained in:
parent
0ea086dc43
commit
e8eb418e0b
5 changed files with 38 additions and 42 deletions
|
@ -6,6 +6,8 @@ youtubebeat:
|
|||
# Defines how often an event is sent to the output
|
||||
period: 1s
|
||||
start_id: "SmBCZgcGlKk"
|
||||
parallelism: 5
|
||||
max_depth: 10
|
||||
|
||||
output.elasticsearch.index: "youtubebeat"
|
||||
|
||||
|
|
|
@ -38,30 +38,30 @@ func New(b *beat.Beat, cfg *common.Config) (beat.Beater, error) {
|
|||
const BaseUrl = "https://www.youtube.com"
|
||||
const BaseSuffix = "/watch?v="
|
||||
|
||||
func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan bool) {
|
||||
func scrapeVideos(b *beat.Beat, bt *Youtubebeat, done chan bool) {
|
||||
videoCollector := colly.NewCollector(
|
||||
colly.AllowedDomains("youtube.com", "www.youtube.com"),
|
||||
colly.Async(true),
|
||||
colly.MaxDepth(10),
|
||||
colly.MaxDepth(bt.config.MaxDepth),
|
||||
)
|
||||
videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 5})
|
||||
videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: bt.config.Parallelism})
|
||||
videoCollector.AllowURLRevisit = true
|
||||
|
||||
videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
|
||||
url := e.Request.URL.String()
|
||||
isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content")
|
||||
if isPaid == "True" {
|
||||
logp.Err("Not parsing video because of isPaid" + url)
|
||||
logp.Warn("Not parsing video because of isPaid" + url)
|
||||
return
|
||||
}
|
||||
title := e.ChildAttr("meta[itemprop=\"name\"]", "content")
|
||||
if title == "YouTube" {
|
||||
logp.Err("Not parsing video because of title " + url)
|
||||
logp.Warn("Not parsing video because of title " + url)
|
||||
return
|
||||
}
|
||||
views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64)
|
||||
if err != nil {
|
||||
logp.Err("Can't parse view count for URL " + url)
|
||||
logp.Warn("Can't parse view count for URL " + url)
|
||||
return
|
||||
}
|
||||
date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content")
|
||||
|
@ -70,8 +70,18 @@ func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan boo
|
|||
"title": title,
|
||||
"views": views,
|
||||
"date": date,
|
||||
"type": b.Info.Name,
|
||||
}
|
||||
fieldsToSend <- fields
|
||||
event := beat.Event{
|
||||
Timestamp: time.Now(),
|
||||
Fields: fields,
|
||||
}
|
||||
|
||||
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
|
||||
event.SetID(id)
|
||||
|
||||
bt.client.Publish(event)
|
||||
logp.Info("Event sent")
|
||||
})
|
||||
|
||||
videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||
|
@ -81,7 +91,7 @@ func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan boo
|
|||
}
|
||||
})
|
||||
|
||||
videoCollector.Visit(BaseUrl + BaseSuffix + startId)
|
||||
videoCollector.Visit(BaseUrl + BaseSuffix + bt.config.StartId)
|
||||
videoCollector.Wait()
|
||||
done <- true
|
||||
}
|
||||
|
@ -96,38 +106,14 @@ func (bt *Youtubebeat) Run(b *beat.Beat) error {
|
|||
return err
|
||||
}
|
||||
|
||||
fieldsToSend := make(chan common.MapStr)
|
||||
done := make(chan bool)
|
||||
go scrapeVideos(bt.config.StartId, fieldsToSend, done)
|
||||
go scrapeVideos(b, bt, done)
|
||||
|
||||
ticker := time.NewTicker(bt.config.Period)
|
||||
for {
|
||||
select {
|
||||
case <-done:
|
||||
return nil
|
||||
case <-bt.done:
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
}
|
||||
// Handle a SIGINT even when no more videos to fetch
|
||||
select {
|
||||
case <-done:
|
||||
return nil
|
||||
case <-bt.done:
|
||||
return nil
|
||||
case fields := <-fieldsToSend:
|
||||
fields["type"] = b.Info.Name
|
||||
event := beat.Event{
|
||||
Timestamp: time.Now(),
|
||||
Fields: fields,
|
||||
}
|
||||
|
||||
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
|
||||
event.SetID(id)
|
||||
|
||||
bt.client.Publish(event)
|
||||
logp.Info("Event sent")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -8,9 +8,13 @@ import "time"
|
|||
type Config struct {
|
||||
Period time.Duration `config:"period"`
|
||||
StartId string `config:"start_id"`
|
||||
Parallelism int `config:"parallelism"`
|
||||
MaxDepth int `config:"max_depth"`
|
||||
}
|
||||
|
||||
var DefaultConfig = Config{
|
||||
Period: 1 * time.Second,
|
||||
StartId: "SmBCZgcGlKk",
|
||||
Parallelism: 5,
|
||||
MaxDepth: 10,
|
||||
}
|
||||
|
|
|
@ -6,6 +6,8 @@ youtubebeat:
|
|||
# Defines how often an event is sent to the output
|
||||
period: 1s
|
||||
start_id: "SmBCZgcGlKk"
|
||||
parallelism: 5
|
||||
max_depth: 10
|
||||
|
||||
output.elasticsearch.index: "youtubebeat"
|
||||
|
||||
|
|
|
@ -6,6 +6,8 @@ youtubebeat:
|
|||
# Defines how often an event is sent to the output
|
||||
period: 1s
|
||||
start_id: "SmBCZgcGlKk"
|
||||
parallelism: 5
|
||||
max_depth: 10
|
||||
|
||||
output.elasticsearch.index: "youtubebeat"
|
||||
|
||||
|
|
Loading…
Reference in a new issue