// Package beater implements youtubebeat: it crawls YouTube video pages
// starting from a configured URL and publishes one event per video.
package beater

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	"github.com/gocolly/colly"

	"github.com/elastic/beats/libbeat/beat"
	"github.com/elastic/beats/libbeat/common"
	"github.com/elastic/beats/libbeat/logp"

	"github.com/Crocmagnon/youtubebeat/config"
)

// Youtubebeat holds the state of a youtubebeat instance.
type Youtubebeat struct {
	done   chan struct{} // closed by Stop to make Run return
	config config.Config // settings unpacked from the beat configuration
	client beat.Client   // publisher connection; set by Run
}

// New creates an instance of youtubebeat.
//
// It starts from config.DefaultConfig and unpacks cfg over it. An error is
// returned when cfg cannot be unpacked.
func New(b *beat.Beat, cfg *common.Config) (beat.Beater, error) {
	c := config.DefaultConfig
	if err := cfg.Unpack(&c); err != nil {
		return nil, fmt.Errorf("Error reading config file: %v", err)
	}

	bt := &Youtubebeat{
		done:   make(chan struct{}),
		config: c,
	}
	return bt, nil
}

const (
	// BaseUrl is prepended to the relative video links found on a page.
	BaseUrl = "https://www.youtube.com"
	// BaseSuffix is the href prefix that identifies a link to a video page.
	BaseSuffix = "/watch?v="
)

// scrapeVideos crawls video pages starting at bt.config.StartUrl, publishes
// one event per parsed video through bt.client, and sends true on done once
// the collector has finished.
//
// The crawl stops issuing new requests once bt.done is closed.
func scrapeVideos(b *beat.Beat, bt *Youtubebeat, done chan bool) {
	videoCollector := colly.NewCollector(
		colly.AllowedDomains("youtube.com", "www.youtube.com"),
		colly.Async(true),
		colly.MaxDepth(bt.config.MaxDepth),
	)
	if err := videoCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: bt.config.Parallelism}); err != nil {
		logp.Warn("Can't apply parallelism limit: %v", err)
	}
	videoCollector.AllowURLRevisit = true

	// Once the beat has been asked to stop, abort every request that has not
	// been sent yet so that Wait below returns and this goroutine can exit.
	videoCollector.OnRequest(func(r *colly.Request) {
		select {
		case <-bt.done:
			r.Abort()
		default:
		}
	})

	// Extract the video metadata from the page and publish it.
	videoCollector.OnHTML("body", func(e *colly.HTMLElement) {
		url := e.Request.URL.String()

		// URLs are passed as format arguments, never concatenated into the
		// format string: they may contain '%' escapes.
		if e.ChildAttr(`meta[itemprop="paid"]`, "content") == "True" {
			logp.Warn("Not parsing video because of isPaid %s", url)
			return
		}

		title := e.ChildAttr(`meta[itemprop="name"]`, "content")
		if title == "YouTube" {
			logp.Warn("Not parsing video because of title %s", url)
			return
		}

		views, err := strconv.ParseInt(e.ChildAttr(`meta[itemprop="interactionCount"]`, "content"), 10, 64)
		if err != nil {
			logp.Warn("Can't parse view count for URL %s", url)
			return
		}

		// The "v" query parameter is used as the event ID. This handler runs
		// for every visited page, so the parameter may be missing.
		id := e.Request.URL.Query().Get("v")
		if id == "" {
			logp.Warn("Can't find video ID for URL %s", url)
			return
		}

		event := beat.Event{
			Timestamp: time.Now(),
			Fields: common.MapStr{
				"url":   url,
				"title": title,
				"views": views,
				"date":  e.ChildAttr(`meta[itemprop="datePublished"]`, "content"),
				"genre": e.ChildAttr(`meta[itemprop="genre"]`, "content"),
				"type":  b.Info.Name,
			},
		}
		event.SetID(id)
		bt.client.Publish(event)
		logp.Info("Event sent")
	})

	// Follow every link that points to another video page.
	videoCollector.OnHTML("a[href]", func(e *colly.HTMLElement) {
		href := e.Attr("href")
		if strings.HasPrefix(href, BaseSuffix) {
			// Best effort: a link the collector declines to follow is simply
			// skipped, so the error is deliberately ignored.
			_ = e.Request.Visit(BaseUrl + href)
		}
	})

	if err := videoCollector.Visit(bt.config.StartUrl); err != nil {
		logp.Err("Can't visit start URL %s: %v", bt.config.StartUrl, err)
	}
	videoCollector.Wait()
	logp.Info("Done parsing all videos")
	done <- true
}

// Run starts youtubebeat.
//
// It connects to the publisher, starts the scraper in a goroutine, and blocks
// until either the scraper finishes or Stop is called. An error is returned
// only when connecting to the publisher fails.
func (bt *Youtubebeat) Run(b *beat.Beat) error {
	logp.Info("youtubebeat is running! Hit CTRL-C to stop it.")

	var err error
	bt.client, err = b.Publisher.Connect()
	if err != nil {
		return err
	}

	// Buffered so the scraper goroutine can always deliver its completion
	// signal and exit, even when Run has already returned on the stop signal.
	done := make(chan bool, 1)
	go scrapeVideos(b, bt, done)

	select {
	case <-done:
		logp.Info("Exiting on done parsing")
		return nil
	case <-bt.done:
		logp.Info("Exiting on beat stop signal")
		return nil
	}
}

// Stop stops youtubebeat.
//
// It closes the publisher client, if Run has connected one, and then closes
// the done channel to make Run return.
func (bt *Youtubebeat) Stop() {
	// client is only set once Run has connected to the publisher.
	if bt.client != nil {
		if err := bt.client.Close(); err != nil {
			logp.Err("Can't close publisher client: %v", err)
		}
	}
	close(bt.done)
}