diff --git a/data-computer/README.md b/data-computer/README.md index 87a0064..e5fe779 100644 --- a/data-computer/README.md +++ b/data-computer/README.md @@ -157,105 +157,35 @@ ### v1/lda -Créer à partir de l'ensemble des documents un champ "lda" constitué de 5 topics. Chaque topic contient un champ "word", qui est composé une liste de 10 mots qui sont les plus caractéristiques du topic, ainsi que d'un champ "weight" qui correspond au poids associé au sujet dans le document. Le texte doit être en anglais. +Créer à partir de l'ensemble des documents un champ "lda" constitué de 1 à 20 topics. Chaque topic contient un champ "words", qui est composé d'une liste de 10 mots qui sont les plus caractéristiques du topic, ainsi que d'un champ "weight" qui correspond au poids associé au sujet dans le document. Le texte doit être en anglais. Les topics non exhaustifs (dont la probabilité est inférieure ou égale à 0.05) ne sont pas retournés. -Par exemple, pour un document pris dans un ensemble de document (l'id "85" est totalement arbitraire) +Par exemple, pour un document pris dans un ensemble de documents (l'id "83" est totalement arbitraire) ```json + { - "id": 85, - "value": "During my culinary adventure through the bustling markets of Marrakech, where the scent of exotic spices hung in the air and vendors beckoned with colorful displays of fruits and textiles, I savored tagines, couscous, and mint tea, discovering the rich tapestry of Moroccan flavors." +"id":"83", +"value":"The current status and distribution of the red panda Ailurus fulgens in the wild is poorly known. The subspecies fulgens is found in the Himalaya in Nepal, India, Bhutan, northern Myanmar and southwest China, and the subspecies styani occurs further to the east in south-central China. The red panda is an animal of subtropical and temperate forests, with the exception of Meghalaya in India, where it is also found in tropical forests. 
In the wild, red pandas take a largely vegetarian diet consisting chiefly of bamboo. The extent of occurrence of the red panda in India is about 170,000 sq km, although its area of occupancy within this may only be about 25,000 sq km. An estimate based on the lowest recorded average density and the total area of potential habitat suggests that the global population of red pandas is about 16,000–20,000. Habitat loss and poaching, in that order, are the major threats. In this paper the distribution, status and conservation problems of the red panda, especially in India, are reviewed, and appropriate conservation measures recommended, including the protection of named areas and the extension of some existing protected areas." } ``` On obtiendra : ```json -{ - "id":85, - "value": "During my culinary adventure through the bustling markets of Marrakech, where the scent of exotic spices hung in the air and vendors beckoned with colorful displays of fruits and textiles, I savored tagines, couscous, and mint tea, discovering the rich tapestry of Moroccan flavors.", - "lda": { - "topic_1": { - "words": [ - "sky", - "tranquil", - "yellow", - "solace", - "symphony", - "leave", - "bird", - "taxi", - "cityscape", - "provide" - ], - "weight": "0.0133591" - }, - "topic_2": { - "words": [ - "bustling", - "air", - "savor", - "tapestry", - "rich", - "adventure", - "tea", - "discover", - "flavor", - "hang" - ], - "weight": "0.94660753" - }, - "topic_3": { - "words": [ - "street", - "air", - "cottage", - "quaint", - "melodic", - "seaside", - "water", - "shore", - "collect", - "sandy" - ], - "weight": "0.013361818" - }, - "topic_4": { - "words": [ - "forest", - "atmosphere", - "leave", - "filter", - "tale", - "tower", - "create", - "floor", - "enchant", - "shadow" - ], - "weight": "0.013335978" - }, - "topic_5": { - "words": [ - "mystery", - "sky", - "embark", - "ponder", - "gaze", - "overwhelming", - "light", - "mountaintop", - "night", - "universe" - ], - "weight": "0.013335522" - } - 
} -} +{ +"id":"83", +"value":"The current status and distribution of the red panda Ailurus fulgens in the wild is poorly known. The subspecies fulgens is found in the Himalaya in Nepal, India, Bhutan, northern Myanmar and southwest China, and the subspecies styani occurs further to the east in south-central China. The red panda is an animal of subtropical and temperate forests, with the exception of Meghalaya in India, where it is also found in tropical forests. In the wild, red pandas take a largely vegetarian diet consisting chiefly of bamboo. The extent of occurrence of the red panda in India is about 170,000 sq km, although its area of occupancy within this may only be about 25,000 sq km. An estimate based on the lowest recorded average density and the total area of potential habitat suggests that the global population of red pandas is about 16,000–20,000. Habitat loss and poaching, in that order, are the major threats. In this paper the distribution, status and conservation problems of the red panda, especially in India, are reviewed, and appropriate conservation measures recommended, including the protection of named areas and the extension of some existing protected areas.", +"lda":{ + +"topic_6":{"words":["diet","animal","high","group","level","study","blood","dietary","intake","increase"],"weight":"0.9416929"}, + +"topic_13":{"words":["diet","intake","human","b12","food","level","protein","vitamin","increase","acid"],"weight":"0.05131816"} +}} ``` -NOTE : l'algorithme a besoin de beaucoup de documents pour fonctionner (plus d'une centaine), d'où la non exhaustivité de l'exemple. +NOTE : la qualité des résultats ne peut être assurée sur un petit corpus. 
+ #### Paramètre(s) URL diff --git a/data-wrapper/v1/csv.ini b/data-wrapper/v1/csv.ini index 22af3c9..e9ead95 100644 --- a/data-wrapper/v1/csv.ini +++ b/data-wrapper/v1/csv.ini @@ -5,7 +5,7 @@ # OpenAPI Documentation - JSON format (dot notation) post.operationId = post-v1-csv post.description = Transformation d'un fichier CSV en fichier corpus -post.summary = Le fichier est transformé en fichier coprus exploitable par un web service asynchrone, chaque ligne est réduite à un object contenant 2 champs (id, value) +post.summary = Le fichier est transformé en fichier corpus exploitable par un web service asynchrone post.tags.0 = data-wrapper post.requestBody.content.text/csv.schema.type = string post.requestBody.content.text/csv.schema.format = binary @@ -13,18 +13,28 @@ post.responses.default.description = Fichier corpus au format tar.gz post.responses.default.content.application/x-tar.schema.type = string post.responses.default.content.application/x-tar.schema.format = binary -post.parameters.0.description = Nom du champ à exploiter comme identifiant de ligne +post.parameters.0.description = Nom du champ à exploiter comme identifiant de colonne post.parameters.0.in = query -post.parameters.0.name = value +post.parameters.0.name = id post.parameters.0.schema.type = string -post.parameters.0.schema.default = value +post.parameters.0.schema.default = id post.parameters.0.required = false -post.parameters.1.description = Nom du champ à exploiter comme identifiant de colonne +post.parameters.1.description = Nom du champ à exploiter comme identifiant de ligne post.parameters.1.in = query -post.parameters.1.name = id +post.parameters.1.name = value post.parameters.1.schema.type = string -post.parameters.1.schema.default = id +post.parameters.1.schema.default = value post.parameters.1.required = false +post.parameters.2.description = chaque ligne est réduite à un objet contenant 2 champs (id, value) +post.parameters.2.in = query +post.parameters.2.name = slim 
+post.parameters.2.schema.type = boolean +post.parameters.2.schema.default = true +post.parameters.2.required = false + +[env] +path = slim +value = env('slim').thru(x => (x === 'false' ? false : true)) [use] plugin = basics @@ -33,12 +43,15 @@ [CSVObject] -[replace] +[assign] path = id value = get(env('id', 'id')) path = value value = get(env('value', 'value')) +[exchange] +value = self().thru(x => _.env('slim') ? _.pick(x, ['id', 'value']) : x) + [TARDump] compress = true manifest = fix({version: '1'}) diff --git a/data-wrapper/v1/istex-tar-gz.ini b/data-wrapper/v1/istex-tar-gz.ini new file mode 100644 index 0000000..95ef476 --- /dev/null +++ b/data-wrapper/v1/istex-tar-gz.ini @@ -0,0 +1,60 @@ +# Entrypoint output format +mimeType = application/x-tar +extension = tar.gz + +# OpenAPI Documentation - JSON format (dot notation) +post.operationId = post-v1-tar-gz +post.description = Transformation d'un fichier ISTEX (format tar.gz) en fichier corpus +post.summary = Le fichier est transformé en fichier corpus exploitable par un web service asynchrone +post.tags.0 = data-wrapper +post.requestBody.content.text/csv.schema.type = string +post.requestBody.content.text/csv.schema.format = binary +post.requestBody.required = true +post.responses.default.description = Fichier corpus au format tar.gz +post.responses.default.content.application/x-tar.schema.type = string +post.responses.default.content.application/x-tar.schema.format = binary +post.parameters.0.description = Nom du champ à exploiter contenant l'identifiant +post.parameters.0.in = query +post.parameters.0.name = id +post.parameters.0.schema.type = string +post.parameters.0.schema.default = ark.0 +post.parameters.0.required = false +post.parameters.1.description = Nom du champ à exploiter contenant la valeur +post.parameters.1.in = query +post.parameters.1.name = value +post.parameters.1.schema.type = string +post.parameters.1.schema.default = abstract +post.parameters.1.required = false 
+post.parameters.2.description = chaque objet est réduit à un objet contenant 2 champs (id, value) +post.parameters.2.in = query +post.parameters.2.name = slim +post.parameters.2.schema.type = boolean +post.parameters.2.schema.default = true +post.parameters.2.required = false + + +[env] +path = slim +value = env('slim').thru(x => (x === 'false' ? false : true)) + +[use] +plugin = basics + +[TARExtract] +compress = true +path = */*.json + +[assign] +path = id +value = get(env('id', 'ark.0')) + +path = value +value = get(env('value', 'abstract')) + +[exchange] +value = self().thru(x => _.env('slim') ? _.pick(x, ['id', 'value']) : x) + +[TARDump] +compress = true +manifest = fix({version: '1'}) +manifest = fix({generator: 'v1/istex-tar-gz'})