Skip to content

Commit

Permalink
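Added Exil crawler config + small additional functionality (exclude selector can match the event itself, date can be read from an attribute)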
Browse files Browse the repository at this point in the history
  • Loading branch information
jakopako committed Jan 20, 2022
1 parent 4cf8a5e commit 0cba486
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 16 deletions.
24 changes: 24 additions & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -396,4 +396,28 @@ crawlers:
location: "Europe/Berlin"
language: "de_DE"

- name: Exil
type: concert
city: Zurich
url: "https://exil.cl/programm"
event: "#detail-main article"
exclude: ".wct_event"
fields:
title:
loc: ".grid-event-title"
comment:
loc: ".hyphenate p"
max_length: 200
date:
day_month_year_time:
loc: ".grid-event-date"
layout: "2006-01-02T15:04"
attr: "datetime"
location: "Europe/Berlin"
language: "de_DE"
filters:
- field: "title"
regex_ignore: ".*abgesagt.*"
- field: "title"
regex_ignore: "geschlossene Gesellschaft"

1 change: 1 addition & 0 deletions example-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ crawlers:
city: Munich # city of the location.
url: "https://backstage.info/veranstaltungen-2/alle-veranstaltungen" # url of the page that contains the events.
event: ".event" # selector for an event.
exclude: ".exclude" # exclude events that match this selector or contain an element that matches it.
fields: # a dictionary of fields. Only the fields listed below are supported, but not all of them are mandatory.
title: # Mandatory field.
loc: ".pos-title a" # selector for the title
Expand Down
38 changes: 22 additions & 16 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func (c Crawler) getEvents() ([]Event, error) {
}

doc.Find(c.Event).Each(func(i int, s *goquery.Selection) {
if s.Find(c.Exclude).Length() > 0 {
if s.Find(c.Exclude).Length() > 0 || s.Is(c.Exclude) {
return
}

Expand Down Expand Up @@ -296,19 +296,22 @@ func getDateStringAndLayout(dl *DateField, s *goquery.Selection) (string, string
// TODO: Add possibility to apply a regex across s.Find(dl.Loc).Text()
// A bit hacky..
if len(fieldStringSelection.Nodes) > 0 {
fieldStringNode := fieldStringSelection.Get(dl.NodeIndex).FirstChild
for fieldStringNode != nil {
if fieldStringNode.Type == html.TextNode {
// we 'abuse' the extractStringRegex func to find the correct text element.
var err error
fieldString, err = extractStringRegex(&dl.RegexExtract, fieldStringNode.Data)
if err == nil {
break
if dl.Attr == "" {
fieldStringNode := fieldStringSelection.Get(dl.NodeIndex).FirstChild
for fieldStringNode != nil {
if fieldStringNode.Type == html.TextNode {
// we 'abuse' the extractStringRegex func to find the correct text element.
var err error
fieldString, err = extractStringRegex(&dl.RegexExtract, fieldStringNode.Data)
if err == nil {
break
}
}
fieldStringNode = fieldStringNode.NextSibling
}
fieldStringNode = fieldStringNode.NextSibling
} else {
fieldString = fieldStringSelection.AttrOr(dl.Attr, "")
}
// fieldString = extractStringRegex(&dl.Regex, fieldString)
}
fieldLayout = dl.Layout
return fieldString, fieldLayout
Expand All @@ -319,18 +322,20 @@ func getFieldString(f *Field, s *goquery.Selection) string {
fieldSelection := s.Find(f.Loc)
if len(fieldSelection.Nodes) > f.NodeIndex {
fieldNode := fieldSelection.Get(f.NodeIndex).FirstChild
if fieldNode.Type == html.TextNode {
fieldString = fieldSelection.Get(f.NodeIndex).FirstChild.Data
if f.MaxLength > 0 && f.MaxLength < len(fieldString) {
return fieldString[:f.MaxLength] + "..."
if fieldNode != nil {
if fieldNode.Type == html.TextNode {
fieldString = strings.TrimSpace(fieldSelection.Get(f.NodeIndex).FirstChild.Data)
if f.MaxLength > 0 && f.MaxLength < len(fieldString) {
fieldString = fieldString[:f.MaxLength] + "..."
}
}
}
}
fieldString, err := extractStringRegex(&f.RegexExtract, fieldString)
if err != nil {
log.Fatal(err)
}
return strings.TrimSpace(fieldString)
return fieldString
}

func extractStringRegex(rc *RegexConfig, s string) (string, error) {
Expand Down Expand Up @@ -444,6 +449,7 @@ type DateField struct {
Layout string `yaml:"layout"`
NodeIndex int `yaml:"node_index"`
RegexExtract RegexConfig `yaml:"regex_extract"`
Attr string `yaml:"attr"`
}

type Field struct {
Expand Down

0 comments on commit 0cba486

Please sign in to comment.