Extract parallelism and max depth to config
This commit is contained in:
parent
0ea086dc43
commit
e8eb418e0b
5 changed files with 38 additions and 42 deletions
|
@@ -6,6 +6,8 @@ youtubebeat:
|
||||||
# Defines how often an event is sent to the output
|
# Defines how often an event is sent to the output
|
||||||
period: 1s
|
period: 1s
|
||||||
start_id: "SmBCZgcGlKk"
|
start_id: "SmBCZgcGlKk"
|
||||||
|
parallelism: 5
|
||||||
|
max_depth: 10
|
||||||
|
|
||||||
output.elasticsearch.index: "youtubebeat"
|
output.elasticsearch.index: "youtubebeat"
|
||||||
|
|
||||||
|
|
|
@@ -38,30 +38,30 @@ func New(b *beat.Beat, cfg *common.Config) (beat.Beater, error) {
|
||||||
const BaseUrl = "https://www.youtube.com"
|
const BaseUrl = "https://www.youtube.com"
|
||||||
const BaseSuffix = "/watch?v="
|
const BaseSuffix = "/watch?v="
|
||||||
|
|
||||||
func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan bool) {
|
func scrapeVideos(b *beat.Beat, bt *Youtubebeat, done chan bool) {
|
||||||
videoCollector := colly.NewCollector(
|
videoCollector := colly.NewCollector(
|
||||||
colly.AllowedDomains("youtube.com", "www.youtube.com"),
|
colly.AllowedDomains("youtube.com", "www.youtube.com"),
|
||||||
colly.Async(true),
|
colly.Async(true),
|
||||||
colly.MaxDepth(10),
|
colly.MaxDepth(bt.config.MaxDepth),
|
||||||
)
|
)
|
||||||
videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 5})
|
videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: bt.config.Parallelism})
|
||||||
videoCollector.AllowURLRevisit = true
|
videoCollector.AllowURLRevisit = true
|
||||||
|
|
||||||
videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
|
videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
|
||||||
url := e.Request.URL.String()
|
url := e.Request.URL.String()
|
||||||
isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content")
|
isPaid := e.ChildAttr("meta[itemprop=\"paid\"]", "content")
|
||||||
if isPaid == "True" {
|
if isPaid == "True" {
|
||||||
logp.Err("Not parsing video because of isPaid" + url)
|
logp.Warn("Not parsing video because of isPaid" + url)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
title := e.ChildAttr("meta[itemprop=\"name\"]", "content")
|
title := e.ChildAttr("meta[itemprop=\"name\"]", "content")
|
||||||
if title == "YouTube" {
|
if title == "YouTube" {
|
||||||
logp.Err("Not parsing video because of title " + url)
|
logp.Warn("Not parsing video because of title " + url)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64)
|
views, err := strconv.ParseInt(e.ChildAttr("meta[itemprop=\"interactionCount\"]", "content"), 10, 64)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
logp.Err("Can't parse view count for URL " + url)
|
logp.Warn("Can't parse view count for URL " + url)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content")
|
date := e.ChildAttr("meta[itemprop=\"datePublished\"]", "content")
|
||||||
|
@@ -70,8 +70,18 @@ func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan boo
|
||||||
"title": title,
|
"title": title,
|
||||||
"views": views,
|
"views": views,
|
||||||
"date": date,
|
"date": date,
|
||||||
|
"type": b.Info.Name,
|
||||||
}
|
}
|
||||||
fieldsToSend <- fields
|
event := beat.Event{
|
||||||
|
Timestamp: time.Now(),
|
||||||
|
Fields: fields,
|
||||||
|
}
|
||||||
|
|
||||||
|
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
|
||||||
|
event.SetID(id)
|
||||||
|
|
||||||
|
bt.client.Publish(event)
|
||||||
|
logp.Info("Event sent")
|
||||||
})
|
})
|
||||||
|
|
||||||
videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||||||
|
@@ -81,7 +91,7 @@ func scrapeVideos(startId string, fieldsToSend chan common.MapStr, done chan boo
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
videoCollector.Visit(BaseUrl + BaseSuffix + startId)
|
videoCollector.Visit(BaseUrl + BaseSuffix + bt.config.StartId)
|
||||||
videoCollector.Wait()
|
videoCollector.Wait()
|
||||||
done <- true
|
done <- true
|
||||||
}
|
}
|
||||||
|
@@ -96,38 +106,14 @@ func (bt *Youtubebeat) Run(b *beat.Beat) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
fieldsToSend := make(chan common.MapStr)
|
|
||||||
done := make(chan bool)
|
done := make(chan bool)
|
||||||
go scrapeVideos(bt.config.StartId, fieldsToSend, done)
|
go scrapeVideos(b, bt, done)
|
||||||
|
|
||||||
ticker := time.NewTicker(bt.config.Period)
|
|
||||||
for {
|
|
||||||
select {
|
select {
|
||||||
case <-done:
|
case <-done:
|
||||||
return nil
|
return nil
|
||||||
case <-bt.done:
|
case <-bt.done:
|
||||||
return nil
|
return nil
|
||||||
case <-ticker.C:
|
|
||||||
}
|
|
||||||
// Handle a SIGINT even when no more videos to fetch
|
|
||||||
select {
|
|
||||||
case <-done:
|
|
||||||
return nil
|
|
||||||
case <-bt.done:
|
|
||||||
return nil
|
|
||||||
case fields := <-fieldsToSend:
|
|
||||||
fields["type"] = b.Info.Name
|
|
||||||
event := beat.Event{
|
|
||||||
Timestamp: time.Now(),
|
|
||||||
Fields: fields,
|
|
||||||
}
|
|
||||||
|
|
||||||
id := strings.Replace(fields["url"].(string), BaseUrl+BaseSuffix, "", -1)
|
|
||||||
event.SetID(id)
|
|
||||||
|
|
||||||
bt.client.Publish(event)
|
|
||||||
logp.Info("Event sent")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -8,9 +8,13 @@ import "time"
|
||||||
type Config struct {
|
type Config struct {
|
||||||
Period time.Duration `config:"period"`
|
Period time.Duration `config:"period"`
|
||||||
StartId string `config:"start_id"`
|
StartId string `config:"start_id"`
|
||||||
|
Parallelism int `config:"parallelism"`
|
||||||
|
MaxDepth int `config:"max_depth"`
|
||||||
}
|
}
|
||||||
|
|
||||||
var DefaultConfig = Config{
|
var DefaultConfig = Config{
|
||||||
Period: 1 * time.Second,
|
Period: 1 * time.Second,
|
||||||
StartId: "SmBCZgcGlKk",
|
StartId: "SmBCZgcGlKk",
|
||||||
|
Parallelism: 5,
|
||||||
|
MaxDepth: 10,
|
||||||
}
|
}
|
||||||
|
|
|
@@ -6,6 +6,8 @@ youtubebeat:
|
||||||
# Defines how often an event is sent to the output
|
# Defines how often an event is sent to the output
|
||||||
period: 1s
|
period: 1s
|
||||||
start_id: "SmBCZgcGlKk"
|
start_id: "SmBCZgcGlKk"
|
||||||
|
parallelism: 5
|
||||||
|
max_depth: 10
|
||||||
|
|
||||||
output.elasticsearch.index: "youtubebeat"
|
output.elasticsearch.index: "youtubebeat"
|
||||||
|
|
||||||
|
|
|
@@ -6,6 +6,8 @@ youtubebeat:
|
||||||
# Defines how often an event is sent to the output
|
# Defines how often an event is sent to the output
|
||||||
period: 1s
|
period: 1s
|
||||||
start_id: "SmBCZgcGlKk"
|
start_id: "SmBCZgcGlKk"
|
||||||
|
parallelism: 5
|
||||||
|
max_depth: 10
|
||||||
|
|
||||||
output.elasticsearch.index: "youtubebeat"
|
output.elasticsearch.index: "youtubebeat"
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue