import json
from ..script import record
from typing import Any, Optional
import requests
# Path of the JSON Lines file that is opened and parsed at module level.
DATA_PATH = "./data/data.jsonl"
# URL queried by get_loaded_corpus(); the query string requests ascending sort.
LOADED_CORPUS = (
"https://loaded-corpus.data.istex.fr/api/export/jsonallvalue?sortDir=ASC"
)
CORPUS_ID_FIELD = "JP7t" # Name of the Lodex field that holds the corpus identifier
# String-to-string mapping; process_loaded_corpus() returns one that maps a
# corpus identifier to the "uri" of its loaded-corpus record.
MappingData = dict[str, str]
# Module-level mapping, initially empty. It is not read or written anywhere
# else in the visible part of this file.
corpus: MappingData = {}
# Decoded JSON payload: a list of objects with string keys.
JsonData = list[dict[str, Any]]
def get_loaded_corpus(timeout: float = 30.0) -> JsonData:
    """Fetch the records exported by the loaded-corpus instance.

    Args:
        timeout: Number of seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        The decoded JSON body returned by the ``LOADED_CORPUS`` endpoint.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(LOADED_CORPUS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    response.raise_for_status()
    data: JsonData = response.json()
    return data
def process_loaded_corpus(json: JsonData) -> MappingData:
    """Map each corpus identifier to the ARK of its loaded-corpus record.

    Args:
        json: Records exported from the loaded-corpus instance. A usable
            record has a ``"uri"`` entry and a ``"fields"`` list whose items
            carry ``"name"`` and ``"value"`` entries. The parameter name
            shadows the ``json`` module inside this function; it is kept so
            that existing keyword callers keep working.

    Returns:
        A mapping from the value of the field named ``CORPUS_ID_FIELD`` to
        the record's ``"uri"``. Records that do not provide both pieces of
        information are skipped.
    """
    id_to_ark: MappingData = {}
    for elem in json:
        try:
            ark: str = elem["uri"]
            # Only the first field named CORPUS_ID_FIELD is used.
            corpus_id: str = next(
                field["value"]
                for field in elem["fields"]
                if field["name"] == CORPUS_ID_FIELD
            )
            id_to_ark[corpus_id] = ark
        except (KeyError, TypeError, StopIteration):
            # Missing key, unexpected value type, or no matching field:
            # the record cannot be mapped, so it is ignored.
            continue
    return id_to_ark
# Errors that mean "this line does not have the expected shape": invalid JSON
# (ValueError), a missing key, a non-mapping where a mapping is required, or
# an empty mapping (StopIteration).
_MALFORMED_LINE_ERRORS = (
    ValueError,
    KeyError,
    TypeError,
    AttributeError,
    StopIteration,
)


def _first_item(mapping: dict[str, Any]) -> tuple[str, Any]:
    """Return the first ``(key, value)`` pair of *mapping*."""
    return next(iter(mapping.items()))


def parse_data(data: str) -> RecordData | None:
    """Build a ``RecordData`` from one JSON line.

    The first key of the decoded object is taken as the date. Two shapes are
    recognised for the value stored under that date:

    * enrichment line::

          {"enrichments.type": {<enrichment>: {"corpusName":
              {"aggregations": {<corpus name>: <nb>}}}}}

    * corpus-size line, tried when the enrichment shape does not match::

          {"aggregations": {"corpusName": {<corpus name>: <nb>}}}

      Such records get the literal type ``"nb_corpus"``.

    In both shapes only the first key of each nested mapping is read.

    Args:
        data: One line of JSON text.

    Returns:
        ``RecordData(corpus_name, date, nb, type)``, or ``None`` when the
        line is not valid JSON or matches neither shape.
    """
    try:
        date, entry = _first_item(json.loads(data))
    except _MALFORMED_LINE_ERRORS:
        return None

    try:
        enrichment, details = _first_item(entry["enrichments.type"])
        corpus_name, nb = _first_item(details["corpusName"]["aggregations"])
    except _MALFORMED_LINE_ERRORS:
        # Not an enrichment line: fall back to the corpus-size shape.
        try:
            corpus_name, nb = _first_item(entry["aggregations"]["corpusName"])
        except _MALFORMED_LINE_ERRORS:
            return None
        enrichment = "nb_corpus"

    # The record is built outside the try blocks so that a failure in the
    # constructor is reported instead of being mistaken for a malformed line.
    # NOTE(review): the visible imports bring in `record` from `..script`,
    # not `RecordData` -- confirm that `RecordData` is actually in scope.
    return RecordData(corpus_name, date, nb, enrichment)
# Enrichment records, in file order.
enrichments: list[RecordData] = []
# Value of `nb` from each "nb_corpus" record, indexed by (corpus, date).
corpus_stats: dict[tuple[str, str], int] = {}

# Pass 1: parse every line and separate corpus-size records from enrichments.
with open(DATA_PATH, encoding="utf-8") as fp:
    for line in fp:
        parsed_data: RecordData | None = parse_data(line)
        if parsed_data is None:
            # Line that parse_data() could not interpret.
            continue
        if parsed_data.type == "nb_corpus":
            key: tuple[str, str] = (parsed_data.corpus, parsed_data.date)
            corpus_stats[key] = parsed_data.nb
        else:
            enrichments.append(parsed_data)

# Pass 2: attach the corpus size recorded for the same (corpus, date) to each
# enrichment. This runs after the whole file is read, so the order of the
# lines in the file does not matter.
for enrichment in enrichments:
    nb_corpus: int | None = corpus_stats.get(
        (enrichment.corpus, enrichment.date)
    )
    # Skipped when no size was recorded; a recorded size of 0 is skipped too.
    if nb_corpus:
        enrichment.set_nb_corpus(nb_corpus)
# _ = get_loaded_corpus()