This repository has been archived on 2022-10-02. You can view files and clone it, but cannot push or open issues or pull requests.
dcbot-old/scraper/scraper.go

134 lines
3.4 KiB
Go

package scraper
import (
"fmt"
"net/url"
"strconv"
"strings"
"sync"
"time"
"twdcbot/tribalwars"
"github.com/PuerkitoBio/goquery"
"github.com/gocolly/colly"
)
// pathEnnoblementsLive is the fmt template for the path and query of the live
// ennoblements page. Its single %s verb is filled with the world name.
const pathEnnoblementsLive = "/%s/index.php?page=ennoblements&live=live"
// Conquer describes a single village takeover parsed from one row of an
// ennoblements table.
//
// ID fields hold the "id" query parameter of the corresponding link and are 0
// when that parameter cannot be read. Name fields hold the trimmed link text.
type Conquer struct {
	// Village and VillageID come from the row's village link.
	Village   string
	VillageID int

	// NewOwner* come from the fourth cell of the row. NewOwnerTribeName is
	// "-" when that cell has no tribe link.
	NewOwnerID        int
	NewOwnerName      string
	NewOwnerTribeID   int
	NewOwnerTribeName string

	// OldOwner* come from the third cell of the row. OldOwnerName and
	// OldOwnerTribeName are "-" when that cell has no player link, and
	// OldOwnerTribeName is "-" when it has no tribe link.
	OldOwnerID        int
	OldOwnerName      string
	OldOwnerTribeID   int
	OldOwnerTribeName string

	// ConqueredAt is the timestamp from the row's last cell, parsed in the
	// location configured for the world's language.
	ConqueredAt time.Time
}
// Scraper gathers conquers for a set of worlds through a colly collector.
type Scraper struct {
	// worlds lists the world names whose ennoblements pages are visited.
	worlds []string
	// since is the cut-off: rows conquered before it are discarded.
	since time.Time
	// collector issues the requests and dispatches the HTML callbacks.
	collector *colly.Collector
	// mutex is held while a callback appends to result.
	mutex sync.Mutex
	// result maps a world name to the conquers scraped for it. It is nil
	// until Scrap allocates it.
	result map[string][]*Conquer
}
// New returns a Scraper for the given worlds that keeps only conquers dated at
// or after since.
//
// The collector is created in async mode and given one limit rule that matches
// every domain (DomainGlob "*") with Parallelism 5 and a RandomDelay of one
// second. The result map is left nil; Scrap allocates it.
func New(worlds []string, since time.Time) *Scraper {
	collector := colly.NewCollector(colly.Async(true))
	collector.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 5,
		RandomDelay: time.Second,
	})

	return &Scraper{
		worlds:    worlds,
		since:     since,
		collector: collector,
	}
}
// getIDFromNodeHref returns the integer value of the "id" query parameter in
// node's href attribute.
//
// It returns 0 when node is nil, has no href attribute, the href cannot be
// parsed as a URL, or the id parameter is empty or not an integer.
func (s *Scraper) getIDFromNodeHref(node *goquery.Selection) int {
	if node == nil {
		return 0
	}

	href, found := node.Attr("href")
	if !found {
		return 0
	}

	parsed, err := url.Parse(href)
	if err != nil {
		return 0
	}

	rawID := parsed.Query().Get("id")
	if rawID == "" {
		return 0
	}

	id, err := strconv.Atoi(rawID)
	if err != nil {
		return 0
	}
	return id
}
// handleHTML converts one ennoblements table row into a Conquer and appends it
// to the result of the world the row was requested for.
//
// The world name is taken from the first segment of the request path. A row is
// dropped silently when:
//   - the request path has no such segment,
//   - no location is configured for the world's language,
//   - the timestamp in the last cell does not match "2006-01-02 - 15:04:05",
//   - the conquer happened before s.since.
//
// Missing old-owner or tribe links are represented by the name "-" and an ID
// of 0.
func (s *Scraper) handleHTML(row *colly.HTMLElement) {
	// Guard the index so an unexpected path cannot panic inside the
	// collector callback.
	segments := strings.Split(row.Request.URL.Path, "/")
	if len(segments) < 2 {
		return
	}
	world := segments[1]

	// A nil location would panic inside the time package. Presumably
	// Locations is a map, so a missing language yields nil — verify against
	// its declaration.
	location := Locations[tribalwars.LanguageCodeFromWorldName(world)]
	if location == nil {
		return
	}

	rawConqueredAt := strings.TrimSpace(row.DOM.Find("td:last-child").Text())
	conqueredAt, err := time.ParseInLocation("2006-01-02 - 15:04:05", rawConqueredAt, location)
	// Before compares instants, so since needs no conversion to location.
	if err != nil || conqueredAt.Before(s.since) {
		return
	}

	c := &Conquer{
		ConqueredAt: conqueredAt,
		// "-" is the placeholder for every owner/tribe link that is absent.
		OldOwnerName:      "-",
		OldOwnerTribeName: "-",
		NewOwnerTribeName: "-",
	}

	villageAnchor := row.DOM.Find("a:first-child").First()
	c.VillageID = s.getIDFromNodeHref(villageAnchor)
	c.Village = strings.TrimSpace(villageAnchor.Text())

	// Third cell: previous owner. Without a player link both placeholders
	// stay in place and the tribe is not looked up at all.
	if oldOwner := row.DOM.Find("td:nth-child(3) a:first-child"); len(oldOwner.Nodes) != 0 {
		c.OldOwnerID = s.getIDFromNodeHref(oldOwner)
		c.OldOwnerName = strings.TrimSpace(oldOwner.Text())
		if oldTribe := row.DOM.Find("td:nth-child(3) .tribelink"); len(oldTribe.Nodes) != 0 {
			c.OldOwnerTribeID = s.getIDFromNodeHref(oldTribe)
			c.OldOwnerTribeName = strings.TrimSpace(oldTribe.Text())
		}
	}

	// Fourth cell: new owner. The player link is read unconditionally.
	newOwner := row.DOM.Find("td:nth-child(4) a:first-child")
	c.NewOwnerID = s.getIDFromNodeHref(newOwner)
	c.NewOwnerName = strings.TrimSpace(newOwner.Text())
	if newTribe := row.DOM.Find("td:nth-child(4) .tribelink"); len(newTribe.Nodes) != 0 {
		c.NewOwnerTribeID = s.getIDFromNodeHref(newTribe)
		c.NewOwnerTribeName = strings.TrimSpace(newTribe.Text())
	}

	// The collector is created in async mode, so appends are serialized.
	s.mutex.Lock()
	s.result[world] = append(s.result[world], c)
	s.mutex.Unlock()
}
// Scrap visits the live ennoblements page of every configured world that has
// a base URL in TwstatsURLs, waits for the collector to finish, and returns
// the collected conquers keyed by world name.
//
// Each call replaces s.result with a fresh map and attaches handleHTML to
// elements matching ".r1" and ".r2".
// NOTE(review): the callbacks are registered again on every call, so a Scraper
// looks intended for a single Scrap call — confirm with callers.
func (s *Scraper) Scrap() map[string][]*Conquer {
	s.result = make(map[string][]*Conquer, len(s.worlds))
	s.collector.OnHTML(".r1", s.handleHTML)
	s.collector.OnHTML(".r2", s.handleHTML)

	for _, world := range s.worlds {
		baseURL := TwstatsURLs[tribalwars.LanguageCodeFromWorldName(world)]
		if baseURL == "" {
			continue
		}
		// Only the path template goes through Sprintf, so a '%' in the
		// base URL is never interpreted as a formatting verb.
		target := baseURL + fmt.Sprintf(pathEnnoblementsLive, world)
		// Scrap has no error return; a world that cannot be visited simply
		// contributes no conquers.
		_ = s.collector.Visit(target)
	}

	s.collector.Wait()
	return s.result
}