# istex-enrich-monitoring / script / lodex-export / loaded-corpus.py
# Mathis EON, 18 Jun 2024 (wip)
import json
from ..script import record
from typing import Any, Optional
import requests

# Relative path of the JSON Lines file read by this script.
DATA_PATH = "./data/data.jsonl"

# Export endpoint of the "loaded corpus" Lodex instance.
LOADED_CORPUS = (
    "https://loaded-corpus.data.istex.fr"
    "/api/export/jsonallvalue?sortDir=ASC"
)

# Name of the Lodex field that holds the corpus identifier.
CORPUS_ID_FIELD = "JP7t"

# Decoded export payload: a list of JSON objects.
JsonData = list[dict[str, Any]]

# String-to-string lookup table.
MappingData = dict[str, str]

corpus: MappingData = {}


def get_loaded_corpus(timeout: float = 30.0) -> JsonData:
    """Download and decode the export of the loaded-corpus instance.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The decoded JSON body served at ``LOADED_CORPUS``.

    Raises:
        requests.Timeout: The server did not answer within ``timeout``.
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other transport-level failure.
        ValueError: The response body is not valid JSON.
    """
    response = requests.get(LOADED_CORPUS, timeout=timeout)
    # Fail loudly on an HTTP error rather than trying to decode an error page.
    response.raise_for_status()
    data: JsonData = response.json()
    return data


def process_loaded_corpus(
    json: JsonData, id_field: Optional[str] = None
) -> MappingData:
    """Build a corpus-identifier -> ARK mapping from the loaded-corpus export.

    For every element, the value of the first field named ``id_field`` is
    mapped to the element's ``uri``. Elements that lack a ``uri``, lack the
    identifier field, or are otherwise malformed are skipped (best effort).
    When two elements share an identifier, the last one wins.

    Args:
        json: Decoded export, a list of objects each carrying a ``uri`` and a
            ``fields`` list of ``{"name": ..., "value": ...}`` objects.
            NOTE: the parameter name shadows the ``json`` module inside this
            function; it is kept for backward compatibility.
        id_field: Name of the field holding the corpus identifier. Defaults
            to the module-level ``CORPUS_ID_FIELD``.

    Returns:
        A dict mapping each corpus identifier to the ``uri`` of its element.
    """
    if id_field is None:
        id_field = CORPUS_ID_FIELD

    id_to_ark: MappingData = {}

    for elem in json:
        try:
            ark: str = elem["uri"]
            # First field whose name matches; StopIteration when none does.
            corpus_id: str = next(
                field["value"]
                for field in elem["fields"]
                if field["name"] == id_field
            )
            id_to_ark[corpus_id] = ark
        except (KeyError, TypeError, StopIteration):
            # Malformed or irrelevant element: skip it, keep the rest.
            continue

    return id_to_ark


def parse_data(data: str) -> RecordData | None:
    """Parse one JSON line of the data file into a ``RecordData``.

    Two layouts are recognised, both keyed by a date at the top level:

    * enrichment counts, tried first::

        {date: {"enrichments.type": {enrichment:
            {"corpusName": {"aggregations": {corpus: nb}}}}}}

    * corpus counts, used as a fallback::

        {date: {"aggregations": {"corpusName": {corpus: nb}}}}

    Only the first date, the first enrichment type and the first corpus of
    the line are read.

    Args:
        data: One line of JSON text.

    Returns:
        ``RecordData(corpus, date, nb, kind)`` where ``kind`` is the
        enrichment type, or ``"nb_corpus"`` for the fallback layout;
        ``None`` when the line is not valid JSON or matches neither layout.
    """
    # Errors raised while walking a payload that lacks the expected shape:
    # ``.get`` on a non-dict, ``iter`` on ``None``, ``next`` on an empty dict.
    malformed = (AttributeError, TypeError, StopIteration)

    try:
        payload = json.loads(data)
        date: str = next(iter(payload))
        per_date = payload.get(date)
    except (ValueError, AttributeError, TypeError, StopIteration):
        # json.JSONDecodeError is a ValueError.
        return None

    try:
        by_type = per_date.get("enrichments.type")
        enrichment: str = next(iter(by_type))
        aggregations = (
            by_type.get(enrichment).get("corpusName").get("aggregations")
        )
        corpus_name: str = next(iter(aggregations))
        fields = (corpus_name, date, aggregations.get(corpus_name), enrichment)
    except malformed:
        # Not an enrichment line: try the corpus-count layout.
        try:
            counts = per_date.get("aggregations").get("corpusName")
            corpus_name = next(iter(counts))
            fields = (corpus_name, date, counts.get(corpus_name), "nb_corpus")
        except malformed:
            return None

    # Built outside the try blocks so that a programming error (for example
    # a wrong constructor signature) surfaces instead of yielding ``None``.
    return RecordData(*fields)


# Records read from DATA_PATH, split by kind:
#   enrichments  - every record whose type is not "nb_corpus"
#   corpus_stats - (corpus, date) -> count taken from "nb_corpus" records
enrichments: list[RecordData] = []
corpus_stats: dict[tuple[str, str], int] = {}

with open(DATA_PATH, encoding="utf-8") as fp:
    for line in fp:
        parsed_data: RecordData | None = parse_data(line)

        # Lines that are not valid JSON or have an unknown layout are skipped.
        if parsed_data is None:
            continue

        if parsed_data.type == "nb_corpus":
            key: tuple[str, str] = (parsed_data.corpus, parsed_data.date)
            corpus_stats[key] = parsed_data.nb
        else:
            enrichments.append(parsed_data)

# Attach the corpus count to each enrichment once the whole file is read, so
# that a count appearing after its enrichments in the file is still found.
for enrichment in enrichments:
    nb_corpus: int | None = corpus_stats.get(
        (enrichment.corpus, enrichment.date)
    )

    # Truthiness test kept from the original: a missing count and a count of
    # zero both leave the enrichment untouched.
    if nb_corpus:
        enrichment.set_nb_corpus(nb_corpus)

# _ = get_loaded_corpus()