From 1fd0be907d5b07e3fc13168358bec4c577ce9a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anton=20Luka=20=C5=A0ijanec?= Date: Sun, 6 Nov 2022 14:07:24 +0100 Subject: fix if server returns 500 for some reason meaning acsm does not exist --- gather.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/gather.py b/gather.py index 1948d8e..2507311 100755 --- a/gather.py +++ b/gather.py @@ -14,9 +14,10 @@ try: except ModuleNotFoundError: raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4") -if len(argv) != 1+1: - raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db") +if len(argv) != 1+2: + raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address") +operator_contact = argv[2] engine = create_engine(argv[1], echo=True, future=True) Base = declarative_base() @@ -53,6 +54,7 @@ logger.debug("welcome to %s", argv[0]) Base.metadata.create_all(engine) starting_acsm_id = 177238 +guaranteed_large_acsm_id = 1170487 logger.debug(f"created metadata.") force_acsm_id = 0 @@ -60,6 +62,8 @@ force_acsm_id = 0 valid_acsms = 0 only_isbn_acsms = 0 failed_acsms = 0 +failed_acsms_not200 = 0 +failed_acsms_not200_in_a_row = 0 try: with Session(engine) as session: @@ -75,10 +79,19 @@ try: else: logger.info(f"continuing from latest {borrow}") acsm_id = borrow.id+1 - r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm") + r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm", headers={"User-Agent": f"python-requests/{requests.__version__} (biblos-stat acsm scraper, contact operator: {operator_contact})"}) + if (r.status_code == 200): + failed_acsms_not200_in_a_row = 0 if r.status_code != 200: - logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}. latest borrow is {borrow}") - break + logger.warning(f"received http response with error code not 200 (it is {r.status_code}). if this continues for {10-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.") + failed_acsms_not200 += 1 + failed_acsms_not200_in_a_row += 1 + force_acsm_id = acsm_id+1 + if failed_acsms_not200_in_a_row == 10: + logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}, which means 10 concurrent responses that are not 200.") + if acsm_id < guaranteed_large_acsm_id: + logger.error(f"this shouldn't happen. I have a hardcoded value that tells me that at time of program writing, acsm id {guaranteed_large_acsm_id} did exist on the server. dying anyways.") + break elif r.text.startswith('