refactor getting the data from the table, waiting for the UI loader to finish, and parsing one day at a time because of the 999-instance limit on loaded data
@@ -1,34 +1,79 @@
from time import sleep
from playwright.sync_api import sync_playwright
import jdatetime
from datetime import timedelta
import sys

sys.stdout.reconfigure(encoding="utf-8")  # type: ignore


def GetDataFromTheTable(filenamenumber):
    for i in range(rows.count()):
        # print(rows.nth(i).inner_html())
        link = "https://rrk.ir" + str(
            rows.nth(i).locator("td a").last.get_attribute("href")
        )
        newPage = page.context.new_page()
        newPage.goto(link)
        detailedData = newPage.locator(
            "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container "
        )
        with open(
            "htmldocs/" + str(filenamenumber) + ".html", "w+", encoding="utf-8"
        ) as file:
            file.write(detailedData.inner_html())
        newPage.close()
        filenamenumber += 1


pw = sync_playwright().start()
firefox = pw.firefox.launch(headless=False)
context = firefox.new_context(ignore_https_errors=True)
page = context.new_page()
inputindate = "1404/9/22"
inputoutdate = "1404/10/2"


def SplitTime(time):
    global year, month, day
    splitted = time.split("/")
    year = int(splitted[0])
    month = int(splitted[1])
    day = int(splitted[2])


SplitTime(inputindate)
start = jdatetime.date(year, month, day)
SplitTime(inputoutdate)
end = jdatetime.date(year, month, day)
current = start
# while current <= end:
#     print(current)
#     current += timedelta(days=1)


datefrom = "1404/9/20"
dateto = "1404/9/24"
page.goto(
    "https://rrk.ir/ords/r/rrs/rrs-front/%D8%AF%D8%A7%D8%AF%D9%87-%D8%A8%D8%A7%D8%B2"
)
page.locator("#P199_SABTNODATE_AZ").fill("1404/09/20")
page.locator("#P199_NEWSPAPERDATE_AZ").fill("1404/10/01")
page.locator("#P199_NEWSPAPERDATE_AZ").fill(datefrom)
page.locator("#P199_SABTNODATE_AZ").fill(datefrom)
page.locator("#P199_NEWSPAPER_TA").fill(dateto)
page.locator("#P199_SABTNODATE_TA").fill(dateto)
page.locator("#B912476867105247978").click()
print("reached")
sleep(4)
table = page.locator(".a-GV-table").nth(1)
print("table is found")
rows = table.locator("tbody tr")
print("rows found")
rows.first.wait_for()
for i in range(rows.count()):
    print(rows.nth(i).inner_html())
    link = "https://rrk.ir" + rows.nth(i).locator("td a").last.get_attribute("href")
    newPage = page.context.new_page()
    newPage.goto(link)
    detailedData = newPage.locator(
        "#R41756901674822518 > div.t-Region-bodyWrap > div.t-Region-body > div.container "
    )
    with open("htmldocs/" + str(i) + ".html", "w+", encoding="utf-8") as file:
        file.write(detailedData.inner_html())
    newPage.close()
sleep(5)
page.locator(".u-Processing").wait_for(state="attached")
page.locator(".u-Processing").wait_for(state="detached")

if page.locator(".a-GV-pageRange").inner_text():
    table = page.locator(".a-GV-table").nth(1)
    rows = table.locator("tbody tr")
    rows.first.wait_for()
    filenamenumber = 0
    GetDataFromTheTable(filenamenumber)
else:
    print("no data ")


def NextPage():
    pass
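
The day-by-day strategy named in the commit message is only hinted at in the code itself (the commented-out while loop over jdatetime dates and the empty NextPage stub). Below is a minimal, self-contained sketch of that loop, assuming only that jdatetime is installed; it is not part of the commit, and the per-day search-and-scrape work is left as a placeholder comment because it would reuse the fill/click/wait and GetDataFromTheTable steps shown in the diff above.

import jdatetime
from datetime import timedelta

start = jdatetime.date(1404, 9, 22)  # inputindate in the script
end = jdatetime.date(1404, 10, 2)    # inputoutdate in the script

current = start
while current <= end:
    day_str = current.strftime("%Y/%m/%d")  # zero-padded Jalali date, e.g. "1404/09/22"
    # Placeholder: fill both "from" and "to" date fields with day_str, click the
    # search button, wait for the .u-Processing spinner to attach and then detach,
    # and call GetDataFromTheTable() on the refreshed grid.
    print(day_str)
    current += timedelta(days=1)  # jdatetime dates accept datetime.timedelta

Searching a single day per query is what keeps each result set under the 999-row cap the commit message mentions, so the page-range check and GetDataFromTheTable never see a truncated listing.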