Refactor getting the data from the table: wait for the UI loader to finish loading, and parse one day at a time because of the limit of no more than 999 instances of loaded data

2025-12-23 09:34:57 +03:30
parent 1b3b0f4bd8
commit 78421703ec
1 changed files with 66 additions and 21 deletions
--- a/crawlingrrk.py
+++ b/crawlingrrk.py
@@ -1,34 +1,79 @@
 from time import sleep
 from playwright.sync_api import sync_playwright
 import jdatetime
 from datetime import timedelta
 import sys
 sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
 def GetDataFromTheTable(filenamenumber):
    """Save the detail page of every row in the results table as an HTML file.
    Reads the module-level ``rows`` locator and ``page`` object. For each row
    the last link in the row is opened in a new tab and the inner HTML of the
    detail region is written to ``htmldocs/<number>.html``.
    Args:
        filenamenumber: Number used for the first file; it is incremented by
            one for every file written.
    Returns:
        The next unused file number, so a later call can continue the
        numbering instead of overwriting files.
    """
    for i in range(rows.count()):
        href = rows.nth(i).locator("td a").last.get_attribute("href")
        if href is None:
            # get_attribute returns None for a missing attribute; str(None)
            # would build the bogus URL "https://rrk.irNone".
            print("row " + str(i) + " has no detail link, skipped")
            continue
        link = "https://rrk.ir" + href
        newPage = page.context.new_page()
        try:
            newPage.goto(link)
            detailedData = newPage.locator(
                "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container "
            )
            # Fetch the HTML before opening the file so that a failed page
            # load does not leave an empty file behind.
            html = detailedData.inner_html()
        finally:
            # Always close the tab, even when navigation or extraction fails.
            newPage.close()
        with open(
            "htmldocs/" + str(filenamenumber) + ".html", "w+", encoding="utf-8"
        ) as file:
            file.write(html)
        filenamenumber += 1
    return filenamenumber
 # Start Playwright without a context manager and launch a visible (non-headless) Firefox.
 pw = sync_playwright().start()
 firefox = pw.firefox.launch(headless=False)
 # HTTPS certificate errors are ignored for every page opened in this context.
 context = firefox.new_context(ignore_https_errors=True)
 page = context.new_page()
 # Date bounds as "year/month/day" strings; they are parsed by SplitTime further down.
 inputindate = "1404/9/22"
 inputoutdate = "1404/10/2"
 def SplitTime(time):
    """Parse a year/month/day string into three integers.
    The values are stored in the module-level ``year``, ``month`` and ``day``
    variables, which existing callers read, and are also returned so that
    callers do not have to go through globals.
    Args:
        time: Date text with its components separated by "/", e.g. 1404/9/22.
    Returns:
        A ``(year, month, day)`` tuple of ints.
    Raises:
        ValueError: If one of the first three components is not an integer.
        IndexError: If fewer than three components are given.
    """
    global year, month, day
    splitted = time.split("/")
    year = int(splitted[0])
    month = int(splitted[1])
    day = int(splitted[2])
    return year, month, day
 # SplitTime stores its result in the globals year/month/day, so each call
 # must be followed immediately by the jdatetime.date construction that reads them.
 SplitTime(inputindate)
 start = jdatetime.date(year, month, day)
 SplitTime(inputoutdate)
 end = jdatetime.date(year, month, day)
 current = start
 # Disabled loop that would step from start to end one day at a time:
 # while current <= end:
 #     print(current)
 #     current += timedelta(days=1)
 # Hard-coded bounds that the form fields below are filled with.
 datefrom = "1404/9/20"
 dateto = "1404/9/24"
 # Open the rrk.ir page whose form fields and results table are used below.
 page.goto(
    "https://rrk.ir/ords/r/rrs/rrs-front/%D8%AF%D8%A7%D8%AF%D9%87-%D8%A8%D8%A7%D8%B2"
 )
-page.locator("#P199_SABTNODATE_AZ").fill("1404/09/20")
+page.locator("#P199_NEWSPAPERDATE_AZ").fill(datefrom)
-page.locator("#P199_NEWSPAPERDATE_AZ").fill("1404/10/01")
+page.locator("#P199_SABTNODATE_AZ").fill(datefrom)
 page.locator("#P199_NEWSPAPER_TA").fill(dateto)
 page.locator("#P199_SABTNODATE_TA").fill(dateto)
 page.locator("#B912476867105247978").click()
-print("reached")
+page.locator(".u-Processing").wait_for(state="attached")
-sleep(4)
+page.locator(".u-Processing").wait_for(state="detached")
-table = page.locator(".a-GV-table").nth(1)
+
-print("table is found")
+if page.locator(".a-GV-pageRange").inner_text():
-rows = table.locator("tbody tr")
+    table = page.locator(".a-GV-table").nth(1)
-print("rows found")
+    rows = table.locator("tbody tr")
-rows.first.wait_for()
+    rows.first.wait_for()
-for i in range(rows.count()):
+    filenamenumber = 0
-    print(rows.nth(i).inner_html())
+    GetDataFromTheTable(filenamenumber)
-    link = "https://rrk.ir" + rows.nth(i).locator("td a").last.get_attribute("href")
+else:
-    newPage = page.context.new_page()
+    print("no data ")
-    newPage.goto(link)
+
-    detailedData = newPage.locator(
+
-        "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container "
+def NextPage():
-    )
+    pass
    with open("htmldocs/" + str(i) + ".html", "w+", encoding="utf-8") as file:
        file.write(detailedData.inner_html())
    newPage.close()
 sleep(5)