path: root/
diff options
authorAnton Luka Šijanec <>2022-11-15 18:35:22 +0100
committerAnton Luka Šijanec <>2022-11-15 18:35:22 +0100
commit7b2b16af1bc952d6f283a72bebf7becacedbd748 (patch)
tree0f16f363ca1d25994cf288a062b4540ae802494c /
parentfixed scoping (diff)
Diffstat (limited to '')
1 files changed, 193 insertions, 0 deletions
diff --git a/ b/
new file mode 100755
index 0000000..198eff9
--- /dev/null
+++ b/
@@ -0,0 +1,193 @@
from sys import argv
import logging
from time import localtime, mktime, time
import requests
from base64 import b64decode
from datetime import datetime, timedelta, timezone

# The third-party packages below are imported inside try/except so that a
# missing package is reported together with the command that installs it.
try:
    from sqlalchemy import Table, MetaData, Integer, BigInteger, String, Column, Table, ForeignKey, create_engine, select, DateTime
    from sqlalchemy.orm import declarative_base, relationship, Session
except ModuleNotFoundError as err:
    raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy") from err
try:
    from bs4 import BeautifulSoup, FeatureNotFound
except ModuleNotFoundError as err:
    raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4") from err
# Contact string that update() places in the User-Agent header of each request.
# Falls back to None when fewer than two arguments were given, so that the
# argument-count check in the __main__ section can raise its usage message
# instead of an IndexError being raised here while the module is loading.
operator_contact = argv[2] if len(argv) > 2 else None
# Declarative base class that the ORM models of this module derive from.
Base = declarative_base()
class Book(Base):
    """ORM model for one row of the ``books`` table, keyed by ISBN.

    The ``doc`` string of each column states where its value comes from.
    """

    __tablename__ = "books"
    isbn = Column(
        BigInteger,
        primary_key=True,
        nullable=False,
        doc="book isbn. found in URL http://www/isbn/978 and in acsm: resource, dc:identifier (sometimes not), thumbnailURL",
    )
    title = Column(String, nullable=True, doc="title of the book, dc:title in acsm")
    creator = Column(String, nullable=True, doc="author of the book, dc:creator in acsm")
    publisher = Column(String, nullable=True, doc="publisher of the book, dc:publisher in acsm")
    identifier = Column(
        String,
        nullable=True,
        doc="if dc:identifier can't be derived from isbn, it's stored here. if dc:identifier element is missing, a literal string noidentifier is stored.",
    )
    thumbnail_extension = Column(
        String,
        nullable=True,
        doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png, may be None if there's no thumbnailURL element",
    )
    format = Column(String, nullable=True, doc="format of the file. I've seen application/pdf and application/epub+zip")
    language = Column(String, nullable=True, doc="language of the book. I've seen sl.")
    # All Borrow rows that reference this book; the reverse side is Borrow.book.
    borrows = relationship("Borrow", back_populates="book")

    def __repr__(self):
        """Return a debugging string with isbn, title, creator and publisher."""
        return f"Book(isbn={self.isbn!r}, title={self.title!r}, creator={self.creator!r}, publisher={self.publisher!r})"
class Borrow(Base):
    """ORM model for one row of the ``borrows`` table: a single fetched acsm.

    The ``doc`` string of each column states where its value comes from.
    """

    __tablename__ = "borrows"
    id = Column(Integer, primary_key=True, nullable=False, doc="id in transaction element of acsm or in filename of acsm on http")
    isbn = Column(ForeignKey("books.isbn"), nullable=False, doc="foreign key that leads to a book")
    transaction = Column(
        String,
        nullable=True,
        doc="transaction element content, but only if it couldn't be derived from format ACS-BIBL-L-{acsm_id}, otherwise Null",
    )
    purchase_utc = Column(DateTime, nullable=True, doc="acsm purchase element excluding timezone in UTC")
    expiration_utc = Column(DateTime, nullable=True, doc="acsm expiration element excluding timezone in UTC")
    purchase_timezone = Column(
        Integer,
        nullable=True,
        doc="acsm purchase element timezone offset from UTC in seconds (note that purchase is UTC)",
    )
    expiration_timezone = Column(
        Integer,
        nullable=True,
        doc="acsm expiration element timezone offset from UTC in seconds (note that expiration is UTC)",
    )
    obtained = Column(BigInteger, nullable=False, doc="UNIX timestamp when this borrow was obtained as acsm from http")
    duration = Column(Integer, nullable=True, doc="duration in seconds that a DRM client may make the book available")
    # The Book this borrow refers to; the reverse side is Book.borrows.
    book = relationship("Book", back_populates="borrows")

    def __repr__(self):
        """Return a debugging string listing the stored fields and the linked book.

        ``obtained`` is rendered as the ``localtime()`` struct of the stored
        UNIX timestamp, wrapped in the text ``mktime(...)``.
        """
        return (
            f"Borrow(id={self.id!r}, isbn={self.isbn!r}, "
            f"purchase={self.purchase_utc!r}, purchase_timezone={self.purchase_timezone!r}, "
            f"expiration={self.expiration_utc!r}, expiration_timezone={self.expiration_timezone!r}, "
            f"obtained=mktime({localtime(self.obtained)!r}), duration={self.duration!r}, "
            f"book={self.book!r})"
        )
# Module logger, named after the path the script was invoked with.
logger = logging.getLogger(argv[0])
logger.debug("welcome to %s", argv[0])
# acsm id that update() requests first when the database holds no borrows.
starting_acsm_id = 177238
# acsm id that update() assumes exists on the server; stopping below it is
# logged as an error.
guaranteed_large_acsm_id = 1170487
def update(engine, hmfan2iarts=100):
    """Request acsm files by increasing id and store each one in the database.

    Requests start after the highest stored Borrow id, or at
    ``starting_acsm_id`` when no borrow is stored. Each response is handled
    according to its kind:

    * non-200 status: counted and skipped. The loop ends after
      ``hmfan2iarts`` such responses in a row, or at once when the most
      recently known borrow was purchased less than an hour ago.
    * body starting with "Napaka pri prenosu": skipped.
    * "no such resource" / "no distribution rights" error: a Borrow holding
      only id, isbn and obtained time is stored.
    * "parameter syntax" error: skipped, nothing is stored.
    * anything else: parsed as an acsm; the Book is created or refreshed and
      a complete Borrow is stored.

    Args:
        engine: SQLAlchemy engine on which the Session is opened.
        hmfan2iarts: number of consecutive non-200 responses after which the
            loop stops.

    Returns:
        dict with the counters ``valid_acsms``, ``only_isbn_acsms``,
        ``failed_acsms``, ``failed_acsms_not200`` and the last requested
        ``acsm_id``.

    Raises:
        ValueError: a field of the acsm disagrees with the value expected
            from another field (resource, thumbnailURL, duration), or a
            number/timestamp cannot be parsed.
        FeatureNotFound: BeautifulSoup cannot provide an "xml" parser.
    """
    force_acsm_id = 0
    valid_acsms = 0
    only_isbn_acsms = 0
    failed_acsms = 0
    failed_acsms_not200 = 0
    failed_acsms_not200_in_a_row = 0
    with Session(engine) as session:
        while True:
            if force_acsm_id != 0:
                # A previous iteration asked for a specific id (skip-ahead).
                acsm_id = force_acsm_id
                force_acsm_id = 0
            else:
                # Continue right after the highest borrow id in the database.
                borrow = session.scalars(select(Borrow).order_by(Borrow.id.desc())).first()
                acsm_id = starting_acsm_id
                if borrow is None:
                    logger.info(f"oooh, it looks like this is a fresh start, db contains no borrows. I'll start with hardcoded acsm id {starting_acsm_id}")
                else:
                    logger.info(f"continuing from latest {borrow}")
                    acsm_id = borrow.id + 1
            # NOTE(review): the URL has no scheme/host in this copy of the
            # source; it looks like the prefix was lost — restore it before use.
            r = requests.get(f"{acsm_id}.acsm", headers={"User-Agent": f"python-requests/{requests.__version__} (biblos-stat acsm scraper, contact operator: {operator_contact})"})
            r.encoding = "UTF-8"
            if r.status_code == 200:
                failed_acsms_not200_in_a_row = 0
            if r.status_code != 200:
                # purchase_utc is stored as naive UTC, so compare against naive
                # UTC "now". borrow is None on an empty database and
                # purchase_utc is None for isbn-only borrows; neither can
                # justify the early exit.
                now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
                if (
                    borrow is not None
                    and borrow.purchase_utc is not None
                    and borrow.purchase_utc > now_utc - timedelta(hours=1)
                ):
                    logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id} and the last requested acsm was created less than an hour ago")
                    break
                logger.warning(f"received http response with error code not 200 (it is {r.status_code}). if this continues for {hmfan2iarts-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.")
                failed_acsms_not200 += 1
                failed_acsms_not200_in_a_row += 1
                force_acsm_id = acsm_id + 1
                if failed_acsms_not200_in_a_row == hmfan2iarts:
                    logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}, which means {hmfan2iarts} concurrent responses that are not 200.")
                    if acsm_id < guaranteed_large_acsm_id:
                        logger.error(f"this shouldn't happen. I have a hardcoded value that tells me that at time of program writing, acsm id {guaranteed_large_acsm_id} did exist on the server. dying anyways.")
                    break
            elif r.text.startswith("Napaka pri prenosu"):
                logger.warning(f"'napaka pri prenosu' received from http for acsm id {acsm_id}, skipping")
                force_acsm_id = acsm_id + 1
            # NOTE(review): xmlns="" in the three error prefixes below looks
            # like a stripped namespace URL — confirm against a real response.
            elif r.text.startswith((
                '<error xmlns="" data="E_URLLINK_NO_SUCH_RESOURCE resid urn:uuid:00000000-1002-0000-0009-',
                '<error xmlns="" data="E_URLLINK_NO_DISTRIBUTION_RIGHTS urn:uuid:00000000-1002-0000-0009-',
            )):
                # The isbn is the last dash-separated group of the resource
                # urn, plus 9e12.
                urns = [x for x in r.text.split() if x.startswith("urn:uuid:00000000-1002-0000-0009-")]
                isbn = int(urns[0].split("-").pop()) + int(9e12)
                borrow = Borrow(id=acsm_id, isbn=isbn, obtained=int(time()))
                logger.warning(f"received either 'no such resource' or 'no distribution rights' from server and stored a quite empty {borrow}")
                session.add(borrow)
                session.commit()
                only_isbn_acsms += 1
            elif r.text.startswith('<error xmlns="" data="E_URLLINK_PARAMETER_SYNTAX rights lrt"/>'):
                logger.warning(f"received urllink parameter syntax error with no usable data for acsm {acsm_id}, so I did not store anything")
                force_acsm_id = acsm_id + 1
                if 199999 <= acsm_id <= 999999:
                    logger.warning("on 2022-11-07, library removed access for acsms 200000-999999. skipping to 1000000")
                    force_acsm_id = 1000000
                failed_acsms += 1
            else:
                try:
                    acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8")
                except FeatureNotFound as err:
                    raise FeatureNotFound("pip3 install lxml") from err
                ft = acsm.fulfillmentToken

                # The transaction text is stored only when it deviates from
                # the predictable ACS-BIBL-L-<id> form.
                transaction = None
                expected = f"ACS-BIBL-L-{acsm_id}"
                if ft.transaction.string != expected:
                    transaction = ft.transaction.string
                    logger.info(f"expected {expected} in transaction.string, but instead received {ft.transaction.string} in acsm {acsm_id}")

                isbn = int(ft.resourceItemInfo.resource.string.split("-").pop()) + int(9e12)

                # The identifier is kept only when it does not parse as a
                # non-zero number; "noidentifier" marks a missing element.
                # NOTE(review): the parsed number is never compared with isbn.
                identifier_is_isbn = True
                identifier_to_isbn = 0
                identifier = "noidentifier"
                try:
                    identifier = ft.resourceItemInfo.metadata.identifier.string
                    identifier_to_isbn = int(identifier.split(":").pop().replace("-", ""))
                except (ValueError, AttributeError):
                    identifier_is_isbn = False
                if identifier_to_isbn == 0:
                    identifier_is_isbn = False

                expected = ft.resourceItemInfo.resource.string
                if ft.licenseToken.resource.string != expected:
                    raise ValueError(f"expected {expected} in ft.resourceItemInfo.licenseToken.resource.string but instead received {ft.resourceItemInfo.licenseToken.resource.string} in acsm {acsm_id}")

                uuid = expected.split(":").pop()
                # NOTE(review): a URL prefix before {uuid} appears to be
                # missing in this copy of the source — confirm.
                expected = f"{uuid}."
                try:
                    if not ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected):
                        raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}")
                    thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop()
                except AttributeError:
                    # Acceptable only when the element is absent altogether.
                    thumbnail_extension = None
                    if ft.resourceItemInfo.metadata.thumbnailURL is not None:
                        raise ValueError(f"thumbnailURL actually exists, but it failed to be parsed in acsm {acsm_id}")

                duration = int(ft.resourceItemInfo.licenseToken.permissions.display.duration.string)
                # NOTE(review): the second operand was lost in this copy of
                # the source; the play permission's duration is assumed.
                play_duration = int(ft.resourceItemInfo.licenseToken.permissions.play.duration.string)
                if duration != play_duration:
                    raise ValueError(f"expected {duration} in ft.resourceItemInfo.licenseToken.permissions.play.duration.string but instead received {play_duration} in acsm {acsm_id}")

                # Decoded only to validate the base64; the value is not stored.
                b64decode(ft.hmac.string, validate=True)

                metadata = ft.resourceItemInfo.metadata
                book_fields = {
                    "title": metadata.find(name="dc:title").string,
                    "creator": metadata.creator.string,
                    "publisher": metadata.publisher.string,
                    "language": metadata.language.string,
                    "format": metadata.format.string,
                    "thumbnail_extension": thumbnail_extension,
                    "identifier": None if identifier_is_isbn else identifier,
                }

                purchase = datetime.strptime(ft.purchase.string, "%Y-%m-%dT%H:%M:%S%z")
                expiration = datetime.strptime(ft.expiration.string, "%Y-%m-%dT%H:%M:%S%z")
                # total_seconds() keeps the sign of the offset; .seconds alone
                # turns a negative offset into a large positive number.
                purchase_timezone = int(purchase.tzinfo.utcoffset(None).total_seconds())
                expiration_timezone = int(expiration.tzinfo.utcoffset(None).total_seconds())
                purchase_utc = purchase.astimezone(timezone.utc).replace(tzinfo=None)
                expiration_utc = expiration.astimezone(timezone.utc).replace(tzinfo=None)

                # Create the book on first sight, otherwise refresh its fields.
                book = session.get(Book, isbn)
                if book is None:
                    book = Book(isbn=isbn, **book_fields)
                else:
                    for field_name, field_value in book_fields.items():
                        setattr(book, field_name, field_value)

                borrow = Borrow(id=acsm_id, isbn=isbn, purchase_utc=purchase_utc, expiration_utc=expiration_utc, obtained=int(time()), book=book, transaction=transaction, purchase_timezone=purchase_timezone, expiration_timezone=expiration_timezone, duration=duration)
                logger.info(f"found a new {borrow!r}")
                session.add(borrow)
                session.commit()
                valid_acsms += 1
    logger.info(f"In this update, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received with response code 200 and {failed_acsms_not200} acsms failed to be received but did not return 200. Last valid requested acsm was {acsm_id}. Thank you for cooperation.")
    return {"valid_acsms": valid_acsms, "only_isbn_acsms": only_isbn_acsms, "failed_acsms": failed_acsms, "failed_acsms_not200": failed_acsms_not200, "acsm_id": acsm_id}
if __name__ == "__main__":
    # Exactly two arguments are required: the database URL and the operator
    # contact.
    if len(argv) != 1 + 2:
        raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address")
    engine = create_engine(argv[1], echo=True, future=True)
    # Create the tables of all models declared on Base.
    Base.metadata.create_all(engine)
    logger.debug("created metadata.")
    try:
        update(engine)
    except KeyboardInterrupt:
        # Ctrl-C during an update is reported as a warning instead of a traceback.
        logger.warning("Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.")