From 1b3b0f4bd848efc6aa63ba932a278c7dd7723b0b Mon Sep 17 00:00:00 2001
From: = <=>
Date: Mon, 22 Dec 2025 23:36:29 +0330
Subject: [PATCH] adding locator for the table and looping and extracting data
 on each link

---
 .gitignore     |  1 +
 crawlingrrk.py | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5924335..1c6d7e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ htmlcov/
 .DS_Store
 .vscode/
 .eggs
+log
 _repo_version.py
 coverage.xml
 junit/
diff --git a/crawlingrrk.py b/crawlingrrk.py
index 1177ca7..1cf3372 100644
--- a/crawlingrrk.py
+++ b/crawlingrrk.py
@@ -13,5 +13,22 @@ page.goto(
 page.locator("#P199_SABTNODATE_AZ").fill("1404/09/20")
 page.locator("#P199_NEWSPAPERDATE_AZ").fill("1404/10/01")
 page.locator("#B912476867105247978").click()
-valueList = page.locator(".a-GV-table ngh(1) tbody tr")
-sleep(10)
+print("reached")
+sleep(4)
+table = page.locator(".a-GV-table").nth(1)
+print("table is found")
+rows = table.locator("tbody tr")
+print("rows found")
+rows.first.wait_for()
+for i in range(rows.count()):
+    print(rows.nth(i).inner_html())
+    link = "https://rrk.ir" + rows.nth(i).locator("td a").last.get_attribute("href")
+    newPage = page.context.new_page()
+    newPage.goto(link)
+    detailedData = newPage.locator(
+        "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container "
+    )
+    with open("htmldocs/" + str(i) + ".html", "w+", encoding="utf-8") as file:
+        file.write(detailedData.inner_html())
+    newPage.close()
+sleep(5)
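
For reference, below is a minimal standalone sketch of the crawling flow this patch adds to crawlingrrk.py, using Playwright's sync API. It is not the script itself: the browser launch, headless mode, and the SEARCH_URL constant are assumptions, since the diff only shows crawlingrrk.py from line 13 onward and the actual page.goto() URL is truncated in the hunk header.

# Hypothetical sketch; browser setup and SEARCH_URL are assumed, as the diff
# does not include the first 12 lines of crawlingrrk.py.
from playwright.sync_api import sync_playwright

SEARCH_URL = "https://rrk.ir"  # placeholder; the real search-form URL is in the un-diffed lines

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)  # assumed launch options
    page = browser.new_page()
    page.goto(SEARCH_URL)

    # Fill the date fields and submit the search, as in the patch
    page.locator("#P199_SABTNODATE_AZ").fill("1404/09/20")
    page.locator("#P199_NEWSPAPERDATE_AZ").fill("1404/10/01")
    page.locator("#B912476867105247978").click()

    # The second .a-GV-table on the results page is the data grid
    table = page.locator(".a-GV-table").nth(1)
    rows = table.locator("tbody tr")
    rows.first.wait_for()  # wait until at least one result row is rendered

    for i in range(rows.count()):
        # The last <a> in each row links to that record's detail page
        href = rows.nth(i).locator("td a").last.get_attribute("href")
        detail_page = page.context.new_page()
        detail_page.goto("https://rrk.ir" + href)
        detail = detail_page.locator(
            "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container"
        )
        # htmldocs/ must already exist, as in the patched script
        with open("htmldocs/" + str(i) + ".html", "w+", encoding="utf-8") as file:
            file.write(detail.inner_html())
        detail_page.close()

    browser.close()

Replacing the patch's fixed sleep() calls with rows.first.wait_for() here is a design choice of the sketch, not something the patch itself does.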