From 797dd0eeaf96df1bf0fb0a142db7a5360512e861 Mon Sep 17 00:00:00 2001 From: Gabriel Augendre Date: Tue, 20 Nov 2018 14:48:22 +0100 Subject: [PATCH] Use a start url --- _meta/beat.yml | 1 + beater/youtubebeat.go | 2 +- config/config.go | 6 ++++-- youtubebeat.reference.yml | 1 + youtubebeat.yml | 1 + 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/_meta/beat.yml b/_meta/beat.yml index a6a7019..e55b78f 100644 --- a/_meta/beat.yml +++ b/_meta/beat.yml @@ -6,6 +6,7 @@ youtubebeat: # Defines how often an event is sent to the output parallelism: 5 max_depth: 10 + start_url: "https://www.youtube.com" output.elasticsearch.index: "youtubebeat" diff --git a/beater/youtubebeat.go b/beater/youtubebeat.go index 78388b4..d657969 100644 --- a/beater/youtubebeat.go +++ b/beater/youtubebeat.go @@ -92,7 +92,7 @@ func scrapeVideos(b *beat.Beat, bt *Youtubebeat, done chan bool) { } }) - videoCollector.Visit(BaseUrl) + videoCollector.Visit(bt.config.StartUrl) videoCollector.Wait() logp.Info("Done parsing all videos") done <- true diff --git a/config/config.go b/config/config.go index b5df0d4..74b541a 100644 --- a/config/config.go +++ b/config/config.go @@ -4,11 +4,13 @@ package config type Config struct { - Parallelism int `config:"parallelism"` - MaxDepth int `config:"max_depth"` + Parallelism int `config:"parallelism"` + MaxDepth int `config:"max_depth"` + StartUrl string `config:"start_url"` } var DefaultConfig = Config{ Parallelism: 5, MaxDepth: 10, + StartUrl: "https://www.youtube.com", } diff --git a/youtubebeat.reference.yml b/youtubebeat.reference.yml index 16108af..5337837 100644 --- a/youtubebeat.reference.yml +++ b/youtubebeat.reference.yml @@ -6,6 +6,7 @@ youtubebeat: # Defines how often an event is sent to the output parallelism: 5 max_depth: 10 + start_url: "https://www.youtube.com" output.elasticsearch.index: "youtubebeat" diff --git a/youtubebeat.yml b/youtubebeat.yml index e0a5cd7..3e98c2c 100644 --- a/youtubebeat.yml +++ b/youtubebeat.yml @@ -6,6 +6,7 @@ youtubebeat: # Defines how often an event is sent to the output parallelism: 5 max_depth: 10 + start_url: "https://www.youtube.com" output.elasticsearch.index: "youtubebeat"