From 728186da2bb0b2534f0f8929c7381374b051ebe7 Mon Sep 17 00:00:00 2001 From: jakopako Date: Mon, 24 Jan 2022 13:52:39 +0100 Subject: [PATCH 1/2] wip --- config.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/config.yml b/config.yml index ceaa3b6..35c5b7a 100644 --- a/config.yml +++ b/config.yml @@ -450,3 +450,25 @@ crawlers: exp: "[0-9]{2}(\\.|:)[0-9]{2}" index: -1 location: "Europe/Berlin" + + - name: Muffatwerk + type: concert + city: Munich + url: "https://www.muffatwerk.de/de/events/concert" + event: ".row .event" + fields: + title: + loc: ".hover-in .center span" + url: + loc: ".hover-in .right a" + relative: true + date: + day_month: + loc: ".hover-in .center" + layout: "02.01." + regex_extract: + exp: "([0-9]{2}\\.){2}" + year: + loc: ".date br" # why does this find the correct year? + layout: "06" + \ No newline at end of file From 59f0b6a0f5a58f420332b7e6e2bfeb1b74a6bc2b Mon Sep 17 00:00:00 2001 From: jakopako Date: Sat, 29 Jan 2022 23:49:35 +0100 Subject: [PATCH 2/2] Add Muffatwerk Fixes #29 --- config.yml | 16 ++++++++++++---- main.go | 23 ++++++++++++++++++----- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/config.yml b/config.yml index 35c5b7a..d672015 100644 --- a/config.yml +++ b/config.yml @@ -465,10 +465,18 @@ crawlers: date: day_month: loc: ".hover-in .center" - layout: "02.01." + layout: "02.01. " regex_extract: - exp: "([0-9]{2}\\.){2}" + exp: "([0-9]{2}\\.){2}\\s" year: - loc: ".date br" # why does this find the correct year? - layout: "06" + loc: ".date" + layout: "06 " # the whitespace in this string is a tab. + regex_extract: + exp: "[0-9]{2}\\s" + time: + loc: ".hover-in .center" + layout: "15:04" + regex_extract: + exp: "[0-9]{2}:[0-9]{2}" + location: "Europe/Berlin" \ No newline at end of file diff --git a/main.go b/main.go index 9724e68..b44cc8e 100644 --- a/main.go +++ b/main.go @@ -12,6 +12,7 @@ import ( "os" "regexp" "sort" + "strconv" "strings" "sync" "time" @@ -212,7 +213,16 @@ func (c Crawler) ignoreEvent(event *Event) (bool, error) { func extractField(item string, s *goquery.Selection, crawler *Crawler, event *Event, events []Event, loc *time.Location, mLocale string, res *http.Response) error { switch item { case "date": - year := time.Now().Year() + currentYear := time.Now().Year() + yearString := strconv.Itoa(currentYear) + yearLayout := "2006" + + if crawler.Fields.Date.Year.Loc != "" { + yearStringTmp, yearLayoutTmp := getDateStringAndLayout(&crawler.Fields.Date.Year, s) + if yearStringTmp != "" { + yearString, yearLayout = yearStringTmp, yearLayoutTmp + } + } var timeString, timeStringLayout string if crawler.Fields.Date.Time.Loc == "" { @@ -240,8 +250,8 @@ func extractField(item string, s *goquery.Selection, crawler *Crawler, event *Ev dayMonthLayout = dayLayout + " " + monthLayout } - dateTimeLayout = fmt.Sprintf("%s 2006 %s", dayMonthLayout, timeStringLayout) - dateTimeString = fmt.Sprintf("%s %d %s", dayMonthString, year, timeString) + dateTimeLayout = fmt.Sprintf("%s %s %s", dayMonthLayout, yearLayout, timeStringLayout) + dateTimeString = fmt.Sprintf("%s %s %s", dayMonthString, yearString, timeString) } if dateTimeString == "" { @@ -255,8 +265,10 @@ func extractField(item string, s *goquery.Selection, crawler *Crawler, event *Ev // actually this is only necessary if we have to guess the date but currently for ease of implementation // this check is done always. if len(events) > 0 { - if events[len(events)-1].Date.After(t) { - t = time.Date(int(year+1), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) + correctYear := currentYear + for events[len(events)-1].Date.After(t) { + correctYear += 1 + t = time.Date(int(correctYear), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), t.Location()) } } event.Date = t @@ -486,6 +498,7 @@ type Crawler struct { Date struct { Day DateField `yaml:"day"` Month DateField `yaml:"month"` + Year DateField `yaml:"year"` DayMonth DateField `yaml:"day_month"` DayMonthYear DateField `yaml:"day_month_year"` DayMonthYearTime DateField `yaml:"day_month_year_time"`