#!/usr/bin/python3
from sys import argv
import logging
from time import localtime, mktime, time
import requests
from base64 import b64decode
try:
from sqlalchemy import Table, MetaData, Integer, BigInteger, String, Column, Table, ForeignKey, create_engine, select
from sqlalchemy.orm import declarative_base, relationship, Session
except ModuleNotFoundError:
raise ModuleNotFoundError("emerge dev-python/sqlalchemy or pip install SQLAlchemy")
try:
from bs4 import BeautifulSoup, FeatureNotFound
except ModuleNotFoundError:
raise ModuleNotFoundError("emerge dev-python/beautifulsoup4 or pip install beautifulsoup4")
# Command line contract: the script name plus exactly two positional arguments.
#   argv[1] -- database URL in SQLAlchemy format
#   argv[2] -- operator contact, sent to the server inside the User-Agent header
if len(argv) != 3:
    raise ValueError("1st argument specifies the db URL in sqlalchemy format. example for sqlite: sqlite:///db, 2nd argument is operator contact that's sent via http in user-agent, for example mailto:email@address")

# echo=True makes SQLAlchemy log every SQL statement it emits.
engine = create_engine(argv[1], echo=True, future=True)
operator_contact = argv[2]

# Common declarative base for the ORM models defined in this file.
Base = declarative_base()
class Book(Base):
    """ORM model for the ``books`` table, keyed by ISBN.

    Each column carries its own ``doc`` string describing where the value
    comes from in the acsm file.
    """

    __tablename__ = "books"

    isbn = Column(
        BigInteger,
        primary_key=True,
        nullable=False,
        doc="book isbn. found in URL http://www/isbn/978 and in acsm: resource, dc:identifier (sometimes not), thumbnailURL",
    )
    title = Column(
        String,
        nullable=True,
        doc="title of the book, dcc:title in acsm",
    )
    creator = Column(
        String,
        nullable=True,
        doc="author of the book, dc:creator in acsm",
    )
    publisher = Column(
        String,
        nullable=True,
        doc="publisher of the book, dc:publisher in acsm",
    )
    identifier = Column(
        String,
        nullable=True,
        doc="if dc:identifier can't be derived from isbn, it's stored here. if dc:identifier element is missing, a literal string noidentifier is stored.",
    )
    thumbnail_extension = Column(
        String,
        nullable=True,
        doc="thumbnails come in predictable URLs, derived from ISBN, apart from the extension. I've observed both jpg and png, may be None if there's no thumbnailURL element",
    )
    format = Column(
        String,
        nullable=True,
        doc="format of the file. I've seen application/pdf and application/epub+zip",
    )
    language = Column(
        String,
        nullable=True,
        doc="language of the book. I've seen sl.",
    )

    # Reverse side of Borrow.book: every borrow recorded for this book.
    borrows = relationship("Borrow", back_populates="book")

    def __repr__(self):
        """Return a short debug representation with the identifying fields."""
        shown = ("isbn", "title", "creator", "publisher")
        body = ", ".join(f"{name}={getattr(self, name)!r}" for name in shown)
        return f"Book({body})"
class Borrow(Base):
    """ORM model for the ``borrows`` table: one row per stored acsm id.

    Each column carries its own ``doc`` string describing where the value
    comes from in the acsm file.
    """

    __tablename__ = "borrows"

    id = Column(
        Integer,
        primary_key=True,
        nullable=False,
        doc="id in transaction element of acsm or in filename of acsm on http",
    )
    isbn = Column(
        ForeignKey("books.isbn"),
        nullable=False,
        doc="foreign key that leads to a book",
    )
    transaction = Column(
        String,
        nullable=True,
        doc="transaction element content, but only if it couldn't be derived from format ACS-BIBL-L-{acsm_id}, otherwise Null",
    )
    purchase = Column(
        String,
        nullable=True,
        doc="acsm purchase element: iso8601 of purchase of book, including timezone",
    )
    expiration = Column(
        String,
        nullable=True,
        doc="acsm expiration element: iso8601 of expiration of acsm, including timezone",
    )
    obtained = Column(
        BigInteger,
        nullable=False,
        doc="UNIX timestamp when this borrow was obtained as acsm from http",
    )

    # Forward side of Book.borrows: the book this borrow refers to.
    book = relationship("Book", back_populates="borrows")

    def __repr__(self):
        """Return a debug representation; ``obtained`` is shown as a local struct_time."""
        parts = [
            f"id={self.id!r}",
            f"isbn={self.isbn!r}",
            f"purchase={self.purchase!r}",
            f"expiration={self.expiration!r}",
            f"obtained=mktime({localtime(self.obtained)!r})",
            f"book={self.book!r}",
        ]
        return "Borrow(" + ", ".join(parts) + ")"
# Logging: NOTSET on the root logger lets every level through.
logging.basicConfig(level=logging.NOTSET)
logger = logging.getLogger(argv[0])
logger.debug("welcome to %s", argv[0])

# Create any tables that do not exist yet.
Base.metadata.create_all(engine)
logger.debug("created metadata.")

# Id boundaries used by the scraping loop.
starting_acsm_id = 177238  # first id to request when the database holds no borrows
guaranteed_large_acsm_id = 1170487  # id that existed on the server when the program was written

# When non-zero, the next loop iteration requests exactly this id instead of
# continuing from the newest stored borrow.
force_acsm_id = 0

# Per-run statistics, all starting at zero.
valid_acsms = only_isbn_acsms = failed_acsms = 0
failed_acsms_not200 = failed_acsms_not200_in_a_row = 0

hmfan2iarts = 100  # how many failed acsms not 200 in a row to stop
# Main scraping loop.
#
# Each iteration picks an acsm id, downloads the corresponding .acsm file and
# then either
#   * skips the id (non-200 response, "Napaka pri prenosu", parameter syntax
#     error) and remembers the next id to try in force_acsm_id,
#   * stores a Borrow carrying only id/isbn/obtained ("no such resource" /
#     "no distribution rights" errors), or
#   * parses the fulfillment token and stores a full Borrow together with its
#     Book (created or updated).
# The loop ends after hmfan2iarts non-200 responses in a row, or on Ctrl-C.

# Bound up front so the summary line after the loop cannot hit an unbound name
# when the run is interrupted before the first id has been chosen.
acsm_id = None
try:
    with Session(engine) as session:
        while True:
            if force_acsm_id != 0:
                # The previous iteration stored nothing for its id, so the
                # database cannot tell us where to continue from.
                acsm_id = force_acsm_id
                force_acsm_id = 0
            else:
                # Continue right after the highest borrow id already stored.
                borrow = session.scalars(select(Borrow).order_by(Borrow.id.desc()).limit(1)).first()
                acsm_id = starting_acsm_id
                if borrow is None:
                    logger.info(f"oooh, it looks like this is a fresh start, db contains no borrows. I'll start with hardcoded acsm id {starting_acsm_id}")
                else:
                    logger.info(f"continuing from latest {borrow}")
                    acsm_id = borrow.id+1
            # NOTE(review): no timeout is passed, so a stalled connection blocks
            # this loop indefinitely -- consider adding one.
            r = requests.get(f"https://www.biblos.si/izposoja/prenesi/{acsm_id}.acsm", headers={"User-Agent": f"python-requests/{requests.__version__} (biblos-stat acsm scraper, contact operator: {operator_contact})"})
            r.encoding = "UTF-8"
            if r.status_code == 200:
                failed_acsms_not200_in_a_row = 0
            if r.status_code != 200:
                logger.warning(f"received http response with error code not 200 (it is {r.status_code}). if this continues for {hmfan2iarts-failed_acsms_not200_in_a_row} more requests, I'll assume there are no more borrows on the server.")
                failed_acsms_not200 += 1
                failed_acsms_not200_in_a_row += 1
                force_acsm_id = acsm_id+1
                if failed_acsms_not200_in_a_row == hmfan2iarts:
                    logger.info(f"we are done for now, as server responded with {r.status_code} for queried acsm id {acsm_id}, which means {hmfan2iarts} concurrent responses that are not 200.")
                    if acsm_id < guaranteed_large_acsm_id:
                        logger.error(f"this shouldn't happen. I have a hardcoded value that tells me that at time of program writing, acsm id {guaranteed_large_acsm_id} did exist on the server. dying anyways.")
                    break
            elif r.text.startswith("Napaka pri prenosu"):
                logger.warning(f"'napaka pri prenosu' received from http for acsm id {acsm_id}, skipping")
                force_acsm_id = acsm_id+1
            elif r.text.startswith((
                '<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_NO_SUCH_RESOURCE resid urn:uuid:00000000-1002-0000-0009-',
                '<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_NO_DISTRIBUTION_RIGHTS urn:uuid:00000000-1002-0000-0009-',
            )):
                # The error still names the resource uuid; the isbn is the
                # uuid's last dash-separated group plus 9e12.
                isbn = int([x for x in r.text.split() if x.startswith("urn:uuid:00000000-1002-0000-0009-")][0].split("-").pop())+int(9e12)
                borrow = Borrow(id=acsm_id, isbn=isbn, obtained=int(time()))
                logger.warning(f"received either 'no such resource' or 'no distribution rights' from server and stored a quite empty {borrow}")
                session.add(borrow)
                session.commit()
                only_isbn_acsms += 1
            elif r.text.startswith('<error xmlns="http://ns.adobe.com/adept" data="E_URLLINK_PARAMETER_SYNTAX rights lrt http://cs.alliance.inkbook.eu:443/fulfillment/URLLink.acsm"/>'):
                logger.warning(f"received urllink parameter syntax error with no usable data for acsm {acsm_id}, so I did not store anything")
                force_acsm_id = acsm_id+1
                if acsm_id >= 199999 and acsm_id <= 999999:
                    logger.warning("on 2022-11-07, library removed access for acsms 200000-999999. skipping to 1000000")
                    force_acsm_id = 1000000
                failed_acsms += 1
            else:
                try:
                    acsm = BeautifulSoup(r.text, "xml", from_encoding="UTF-8")
                except FeatureNotFound:
                    raise FeatureNotFound("pip3 install lxml")
                ft = acsm.fulfillmentToken

                # Keep the transaction text only when it deviates from the
                # predictable ACS-BIBL-L-<id> form.
                transaction = None
                expected = f"ACS-BIBL-L-{acsm_id}"
                if ft.transaction.string != expected:
                    transaction = ft.transaction.string
                    logger.info(f"expected {expected} in transaction.string, but instead received {ft.transaction.string} in acsm {acsm_id}")

                # isbn is the last dash-separated group of the resource uuid plus 9e12.
                isbn = int(ft.resourceItemInfo.resource.string.split("-").pop())+int(9e12)

                # dc:identifier counts as "an isbn" when it parses to a
                # non-zero integer; a missing element leaves the literal
                # "noidentifier".
                # NOTE(review): the parsed number is never compared against
                # isbn -- confirm whether that is intended.
                identifier_is_isbn = True
                identifier_to_isbn = 0
                identifier = "noidentifier"
                try:
                    identifier = ft.resourceItemInfo.metadata.identifier.string
                    identifier_to_isbn = int(identifier.split(":").pop().replace("-", ""))
                except (ValueError, AttributeError):
                    identifier_is_isbn = False
                if identifier_to_isbn == 0:
                    identifier_is_isbn = False

                # Sanity check: the license token must name the same resource.
                expected = ft.resourceItemInfo.resource.string
                license_resource = ft.licenseToken.resource.string
                if license_resource != expected:
                    # Report the value that was actually compared.
                    raise ValueError(f"expected {expected} in ft.resourceItemInfo.licenseToken.resource.string but instead received {license_resource} in acsm {acsm_id}")

                # Sanity check: the thumbnail URL must be derived from the
                # resource uuid; only its extension is stored.
                uuid = expected.split(":").pop()
                expected = f"https://cs.alliance.inkbook.eu/books/{uuid}."
                try:
                    if not ft.resourceItemInfo.metadata.thumbnailURL.string.startswith(expected):
                        raise ValueError(f"expected {expected} in ft.resourceItemInfo.metadata.thumbnailURL.string but instead received {ft.resourceItemInfo.metadata.thumbnailURL.string} in acsm {acsm_id}")
                    thumbnail_extension = ft.resourceItemInfo.metadata.thumbnailURL.string.split(".").pop()
                except AttributeError:
                    thumbnail_extension = None
                    if ft.resourceItemInfo.metadata.thumbnailURL is not None:
                        raise ValueError(f"thumbnailURL actually exists, but it failed to be parsed in acsm {acsm_id}")

                # Sanity check: display and play durations must agree.
                duration = int(ft.resourceItemInfo.licenseToken.permissions.display.duration.string)
                play_duration = int(ft.resourceItemInfo.licenseToken.permissions.play.duration.string)
                if duration != play_duration:
                    # The original message interpolated a bare, undefined
                    # `resourceItemInfo`, so a mismatch raised NameError
                    # instead of this ValueError.
                    raise ValueError(f"expected {duration} in fr.int(resourceItemInfo.licenseToken.permissions.play.duration.string) but instead received {play_duration} in acsm {acsm_id}")

                # The decoded value is unused; the call only validates that
                # the hmac element holds well-formed base64.
                hmac = b64decode(ft.hmac.string, validate=True)

                title = ft.resourceItemInfo.metadata.find(name="dc:title").string
                creator = ft.resourceItemInfo.metadata.creator.string
                publisher = ft.resourceItemInfo.metadata.publisher.string
                language = ft.resourceItemInfo.metadata.language.string
                format = ft.resourceItemInfo.metadata.format.string
                purchase = ft.purchase.string
                expiration = ft.expiration.string
                if identifier_is_isbn:
                    identifier = None

                # Create the book on first sight, otherwise refresh its
                # fields from this acsm.
                book = session.get(Book, isbn)
                if book is None:
                    book = Book(identifier=identifier, isbn=isbn, title=title, creator=creator, publisher=publisher, thumbnail_extension=thumbnail_extension, language=language, format=format)
                else:
                    book.identifier = identifier
                    book.isbn = isbn
                    book.title = title
                    book.creator = creator
                    book.publisher = publisher
                    book.thumbnail_extension = thumbnail_extension
                    book.language = language
                    book.format = format
                borrow = Borrow(id=acsm_id, isbn=isbn, purchase=purchase, expiration=expiration, obtained=int(time()), book=book, transaction=transaction)
                logger.info(f"found a new {borrow!r}")
                session.add(borrow)
                session.commit()
                valid_acsms += 1
except KeyboardInterrupt:
    logger.warning("Keyboard interrupt. Exiting. I hope this terminated cleanly. Last requested acsm was discarded.")
logger.info(f"In this session, {valid_acsms} valid acsms were stored, {only_isbn_acsms} acsms had only isbn and no other data available and {failed_acsms} acsms failed to be received with response code 200 and {failed_acsms_not200} acsms failed to be received but did not return 200. Last valid requested acsm was {acsm_id}. Thank you for cooperation.")