From 78421703ec680c7c871043cf31296ec2b43e9102 Mon Sep 17 00:00:00 2001
From: ghaem
Date: Tue, 23 Dec 2025 09:34:57 +0330
Subject: [PATCH] refactor: get the data from the table, wait for the UI
 loader to finish, and parse one day at a time because of the limit of more
 than 999 instances of loaded data

---
 crawlingrrk.py | 87 ++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 66 insertions(+), 21 deletions(-)

diff --git a/crawlingrrk.py b/crawlingrrk.py
index 1cf3372..47dad8a 100644
--- a/crawlingrrk.py
+++ b/crawlingrrk.py
@@ -1,34 +1,79 @@
 from time import sleep
 from playwright.sync_api import sync_playwright
+import jdatetime
+from datetime import timedelta
+import sys
+
+sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
+
+
+def GetDataFromTheTable(filenamenumber):
+    for i in range(rows.count()):
+        # print(rows.nth(i).inner_html())
+        link = "https://rrk.ir" + str(
+            rows.nth(i).locator("td a").last.get_attribute("href")
+        )
+        newPage = page.context.new_page()
+        newPage.goto(link)
+        detailedData = newPage.locator(
+            "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container "
+        )
+        with open(
+            "htmldocs/" + str(filenamenumber) + ".html", "w+", encoding="utf-8"
+        ) as file:
+            file.write(detailedData.inner_html())
+        newPage.close()
+        filenamenumber += 1
 
 
 pw = sync_playwright().start()
 firefox = pw.firefox.launch(headless=False)
 context = firefox.new_context(ignore_https_errors=True)
 page = context.new_page()
+inputindate = "1404/9/22"
+inputoutdate = "1404/10/2"
+
+
+def SplitTime(time):
+    global year, month, day
+    splitted = time.split("/")
+    year = int(splitted[0])
+    month = int(splitted[1])
+    day = int(splitted[2])
+
+
+SplitTime(inputindate)
+start = jdatetime.date(year, month, day)
+SplitTime(inputoutdate)
+end = jdatetime.date(year, month, day)
+current = start
+# while current <= end:
+#     print(current)
+#     current += timedelta(days=1)
+
+
+datefrom = "1404/9/20"
+dateto = "1404/9/24"
 page.goto(
     "https://rrk.ir/ords/r/rrs/rrs-front/%D8%AF%D8%A7%D8%AF%D9%87-%D8%A8%D8%A7%D8%B2"
 )
-page.locator("#P199_SABTNODATE_AZ").fill("1404/09/20")
-page.locator("#P199_NEWSPAPERDATE_AZ").fill("1404/10/01")
+page.locator("#P199_NEWSPAPERDATE_AZ").fill(datefrom)
+page.locator("#P199_SABTNODATE_AZ").fill(datefrom)
+page.locator("#P199_NEWSPAPER_TA").fill(dateto)
+page.locator("#P199_SABTNODATE_TA").fill(dateto)
 page.locator("#B912476867105247978").click()
-print("reached")
-sleep(4)
-table = page.locator(".a-GV-table").nth(1)
-print("table is found")
-rows = table.locator("tbody tr")
-print("rows found")
-rows.first.wait_for()
-for i in range(rows.count()):
-    print(rows.nth(i).inner_html())
-    link = "https://rrk.ir" + rows.nth(i).locator("td a").last.get_attribute("href")
-    newPage = page.context.new_page()
-    newPage.goto(link)
-    detailedData = newPage.locator(
-        "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container "
-    )
-    with open("htmldocs/" + str(i) + ".html", "w+", encoding="utf-8") as file:
-        file.write(detailedData.inner_html())
-    newPage.close()
-sleep(5)
+page.locator(".u-Processing").wait_for(state="attached")
+page.locator(".u-Processing").wait_for(state="detached")
 
+if page.locator(".a-GV-pageRange").inner_text():
+    table = page.locator(".a-GV-table").nth(1)
+    rows = table.locator("tbody tr")
+    rows.first.wait_for()
+    filenamenumber = 0
+    GetDataFromTheTable(filenamenumber)
+else:
+    print("no data ")
+
+
+def NextPage():
+    pass