Skip to content

Commit

Permalink
added ability to scrape local .html files
Browse files Browse the repository at this point in the history
  • Loading branch information
Malvere committed Oct 19, 2023
1 parent 8d1b5e1 commit 64595c4
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 6 deletions.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ sber-scrape-win-x86.exe
.github/workflows/go.yml
.DS_Store
/bin*
sber.csv
sber.csv
.html
page.html
6 changes: 4 additions & 2 deletions cmd/sber-scrape/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,16 @@ import (

var (
configPath string
urlFlag string
mode string
searchFlag string
urlFlag string
parseURL string
)

// init registers the command-line flags and binds them to the package-level
// variables declared above:
//
//	-config-path  path to the config file (default "config/config.toml")
//	-m            run mode (default "web"); the help text describes "web" as
//	              making HTTP requests and "local" as reading an .html file
//	-s            search string (default empty)
//	-u            URL to parse (default empty)
//
// Flags are only registered here, not parsed.
func init() {
flag.StringVar(&configPath, "config-path", "config/config.toml", "path to config file")
flag.StringVar(&mode, "m", "web", "mode to run in. <web> makes HTTP requests, while <local> searches for .html file")
flag.StringVar(&searchFlag, "s", "", "search")
flag.StringVar(&urlFlag, "u", "", "parse url")
}
Expand Down Expand Up @@ -51,7 +53,7 @@ func main() {
}

// Send a GET request to the URL and parse
if err := sparser.Start(config, parseURL); err != nil {
if err := sparser.Start(config, parseURL, mode); err != nil {
log.Fatal(err)
}
}
14 changes: 11 additions & 3 deletions internal/sparser/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,24 @@ import (
"sber-scrape/internal/store/sqlstore"
)

func Start(config *Config, url string) error {
func Start(config *Config, url string, mode string) error {
store, db, err := chooseStore(config)
if err != nil {
log.Fatal(err)
return err
}
defer db.Close()
if err := GetHtml(url, store); err != nil {
log.Fatal(err)
switch mode {
case "web":
if err := GetHtml(url, store); err != nil {
log.Fatal(err)
}
case "local":
if err := GetLocalHtml("page.html", store); err != nil {
log.Fatal(err)
}
}

if store, ok := store.(*litestore.Store); ok {
log.Print("Preparing .csv file...")
CommaSeparated("sber", store)
Expand Down
70 changes: 70 additions & 0 deletions internal/sparser/sparser.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"log"
"net/http"
"os"
"regexp"
"sber-scrape/internal/model"
"sber-scrape/internal/store"
Expand Down Expand Up @@ -93,3 +94,72 @@ func GetHtml(url string, store store.Store) error {
})
return nil
}

// GetLocalHtml parses the HTML file at filename and saves every product found
// in an ".item-block" element through store.
//
// For each item block it extracts the title, price, bonus amount, bonus
// percent, discount, product ID and link, creates the product via
// store.Product().Create and prints the extracted fields to stdout.
//
// It returns an error if the file cannot be opened, the document cannot be
// parsed, or a product cannot be stored. Processing stops at the first product
// that fails to store. Errors are returned rather than passed to log.Fatal so
// that the caller decides how to terminate and its deferred cleanup can run.
func GetLocalHtml(filename string, store store.Store) error {
	file, err := os.Open(filename)
	if err != nil {
		return fmt.Errorf("opening %q: %w", filename, err)
	}
	// The file is only read, so a Close error carries no useful information.
	defer file.Close()

	doc, err := goquery.NewDocumentFromReader(file)
	if err != nil {
		return fmt.Errorf("parsing %q: %w", filename, err)
	}

	// Compiled once per call instead of once per extracted field.
	nonDigits := regexp.MustCompile("[^0-9]+")

	// convInt strips every non-digit character from str and converts the rest
	// to an int. An empty str yields 0 without an error.
	convInt := func(str string) (int, error) {
		if str == "" {
			return 0, nil
		}
		return strconv.Atoi(nonDigits.ReplaceAllString(str, ""))
	}

	var createErr error
	doc.Find(".item-block").EachWithBreak(func(_ int, itemBlock *goquery.Selection) bool {
		// getText returns the text of the first-level match for selector with
		// tabs removed and surrounding whitespace trimmed.
		getText := func(selector string) string {
			raw := itemBlock.Find(selector).Text()
			return strings.TrimSpace(strings.ReplaceAll(raw, "\t", ""))
		}

		// Conversion errors are deliberately ignored: scraping is best-effort
		// and a field that is absent or holds no digits is stored as 0.
		itemTitle := getText(".item-title")
		itemPrice, _ := convInt(getText(".item-price"))
		bonusAmount, _ := convInt(getText(".bonus-amount"))
		bonusPercent, _ := convInt(getText(".bonus-percent"))
		discount, _ := convInt(getText(".discount-percentage__value"))

		// Extract productID and link attributes. A missing attribute yields an
		// empty string, so the ID becomes 0 and the link is just the site root.
		productLink := itemBlock.Find(".ddl_product_link")
		productIDText, _ := productLink.Attr("data-product-id")
		productID, _ := convInt(productIDText)
		href, _ := productLink.Attr("href")
		link := fmt.Sprintf("%s%s", "https://megamarket.ru", href)

		p := &model.Product{
			Title:        itemTitle,
			Price:        itemPrice,
			BonusAmount:  bonusAmount,
			BonusPercent: bonusPercent,
			Discount:     discount,
			ProductID:    productID,
			Link:         link,
		}
		if err := store.Product().Create(p); err != nil {
			createErr = fmt.Errorf("storing product %d: %w", productID, err)
			return false // stop iterating; the error is returned below
		}

		// Print the extracted data
		fmt.Println("Title: ", itemTitle)
		fmt.Println("Price: ", itemPrice)
		fmt.Println("SBonuses: ", bonusAmount)
		fmt.Println("SBonuses %: ", bonusPercent)
		fmt.Println("Discount: ", discount)
		fmt.Println("Product ID", productID)
		fmt.Println("URL: ", link)
		return true
	})
	return createErr
}

0 comments on commit 64595c4

Please sign in to comment.