def splitFirstName(self, name):
    """Build the lower-cased spellings under which ``name`` may be cited.

    ``name`` is read as "<first name(s)> <last name>": the last
    whitespace-separated token is the last name and everything before it is
    the (possibly composed) first name.

    Args:
        name: Full name of a person, e.g. ``"Jean-Pierre Dupont"``.

    Returns:
        A list of lower-cased candidate spellings. It starts with
        ``name.lower()``; a blank name yields an empty list and a name made
        of a single token yields only ``[name.lower()]`` (there is no last
        name to build variants from).
    """
    lowered = name.lower()
    tokens = lowered.split()  # any whitespace run separates, empties dropped
    if not tokens:
        return []
    if len(tokens) == 1:
        return [lowered]

    first, last = " ".join(tokens[:-1]), tokens[-1]
    choice = [lowered, last + " " + first]

    # Empty parts ("jean-", "jean--pierre") are dropped so that taking the
    # initial of a part can never fail.
    hyphen_parts = [part for part in first.split("-") if part]
    if len(hyphen_parts) > 1:
        # Composed first name written with a hyphen.
        a, b = hyphen_parts[0], hyphen_parts[1]
        choice.append(last + ", " + a[0] + ".-" + b[0] + ".")  # lastname, J.-P.
        choice.append(last + ", " + a[0] + "." + b[0] + ".")   # lastname, J.P.
        return choice

    space_parts = first.split(" ")
    if len(space_parts) > 1:
        # Composed first name written with a space.
        a, b = space_parts[0], space_parts[1]
        choice.append(last + ", " + a + ".-" + b[0] + ".")      # lastname, Jaa.-P.
        choice.append(last + ", " + a + " " + b[0] + ".")       # lastname, Jaa P.
        choice.append(last + ", " + a[0] + "." + b[0] + ".")    # lastname, J.P.
        choice.append(a + " " + b[0] + ". " + last)             # Jaa P. lastname
        choice.append(a[0] + "." + b[0] + ". " + last)          # J.P. lastname
        choice.append(a[0] + b[0] + ". " + last)                # JP. lastname
    else:
        choice.append(last + ", " + first[0] + ".")             # lastname, J.
    return choice


def checkEmail(self, email):
    """Tell whether ``email`` is one of the addresses in ``self.infoDic``.

    Args:
        email: Address to look for.

    Returns:
        ``(True, address)`` for the first entry of ``self.infoDic["email"]``
        equal to ``email``, otherwise ``(False, 0)``.
    """
    known = self.infoDic["email"]
    if isinstance(known, str):
        # A lone address must be compared as a whole, not char by char.
        known = [known]
    for em in known:
        if em == email:
            return True, em
    return False, 0


def disambiguation(self):
    """Rank the ORCID records that may belong to the person in ``self.infoDic``.

    Candidates are fetched by first and last name. For each candidate, in
    order: an e-mail match, a work title whose similarity ratio exceeds 0.7,
    or a co-author match ends the search immediately with a score of 100.
    Otherwise each matching affiliation is worth 10 points and the name is
    worth 10 (first + last), 7 (first initial + last) or 5 (last only).

    Returns:
        A list of ``[orcid, reasons, points]`` entries. It holds a single
        entry when a decisive match was found, else every candidate with a
        non-zero score, sorted by decreasing score with ``getPoints``.
    """
    import sys  # local import: only needed for the diagnostics below

    def log(*parts):
        # Diagnostics go to stderr: the command-line wrapper of this class
        # writes its JSON results on stdout.
        print(*parts, file=sys.stderr)

    query = self.infoDic
    df = self.getDfFromName(query["firstName"] + "+AND+" + query["lastName"])
    time.sleep(self.timeBetweenrequest)
    personsInfos = self.extractDfInfos(df)

    for personInfos in personsInfos:
        log("Checking ", personInfos["firstName"], " ", personInfos["lastName"])
        orcid = personInfos["orcid"]
        matchArg = []
        points = 0

        if "email" in query:  # check email
            log("Checking email")
            found, em = self.checkEmail(personInfos["email"])
            if found:
                return [[orcid, ["Email " + em], 100]]

        works = self.getWorksFromOrcid(orcid)
        time.sleep(self.timeBetweenrequest)
        worksInfo = self.extractInfoFromWorks(works)

        if "titles" in query:  # check title
            log("Checking titles")
            titles = query["titles"]
            # Lower-case the candidate's titles once, not once per query title.
            known_titles = [known.lower() for known in worksInfo["title"]] if titles else []
            for title in titles:
                wanted = title.lower()
                for tit in known_titles:
                    # similarity between the two titles
                    ratio = SequenceMatcher(None, wanted, tit).ratio()
                    if ratio > 0.7:
                        log("matching title ", ratio)
                        return [[orcid, ["title " + tit], 100]]

        if "coAuthors" in query:  # check coAuthors
            log("Checking Co-authors")
            authorsPutcode = []
            for putcode in worksInfo["putcode"]:
                authorsPutcode += self.getCoAuthorsFromPutcode(putcode, orcid)
                time.sleep(self.timeBetweenrequest)
            authors = {auth.lower() for auth in authorsPutcode}
            for author in query["coAuthors"]:
                for choice in self.splitFirstName(author):
                    if choice in authors:
                        return [[orcid, ["Co-authors " + choice], 100]]

        if "affiliations" in query:  # check affiliations
            log("Checking affiliations")
            known_affiliations = {aff.lower() for aff in personInfos["affiliations"]}
            for affiliation in query["affiliations"]:
                if affiliation.lower() in known_affiliations:
                    matchArg.append("Affiliation " + affiliation)
                    points += 10

        # check first and last name
        if query["lastName"].lower() == personInfos["lastName"].lower():
            wanted_first = query["firstName"].lower()
            found_first = personInfos["firstName"].lower()
            if wanted_first == found_first:
                points += 10
                matchArg.append("Match First+LastName ")
            elif wanted_first[:1] == found_first[:1]:
                # Slicing instead of indexing: an empty first name on one
                # side simply does not match instead of raising.
                points += 7
                matchArg.append("Match FirstName First Letter+LastName ")
            else:
                points += 5
                matchArg.append("Match LastName ")

        personInfos["points"] = points
        personInfos["matchArg"] = matchArg

    # Keep only the candidates that scored, best first.
    finalReturn = [
        [personInfos["orcid"], personInfos["matchArg"], personInfos["points"]]
        for personInfos in personsInfos
        if personInfos["points"] != 0
    ]
    finalReturn.sort(key=getPoints, reverse=True)
    return finalReturn
= true +post.requestBody.content.application/json.schema.$ref = #/components/schemas/JSONStream +post.parameters.0.in = query +post.parameters.0.name = path +post.parameters.0.schema.type = string +post.parameters.0.description = The path in each object to enrich with a Python script +post.parameters.1.in = query +post.parameters.1.name = indent +post.parameters.1.schema.type = boolean +post.parameters.1.description = Indent or not the JSON Result + +post.parameters.2.in = query +post.parameters.2.name = nameDepth +post.parameters.2.schema.type = integer +post.parameters.2.description = Maximum number of people to check + +post.parameters.3.in = query +post.parameters.3.name = worksDepth +post.parameters.3.schema.type = integer +post.parameters.3.description = Maximum number of works we take for a person + +[use] +plugin = @ezs/spawn +plugin = @ezs/basics +plugin = @ezs/storage +plugin = @ezs/analytics + +[JSONParse] +legacy = false +separator = $ + +[expand] +path = env('path', 'value') +size = 100 +# in production mode, uncomment the following line +# cache = boost + +[expand/exec] +# command should be executable !
@plac.annotations(
    nameDepth=("Maximum number of people to check", "option", "p", int),
    worksDepth=("Maximum number of works we take for a person", "option", "q", int),
)
def main(nameDepth=20, worksDepth=20):
    """Disambiguate every JSON record read from stdin.

    Each non-blank input line is a JSON object whose ``value`` field is
    passed to ``disambiguate``. ``value`` is then replaced by the ORCID of
    the best-ranked candidate (as a string), or by ``"None"`` when
    ``disambiguation()`` returns nothing, and the object is written back as
    one JSON line on stdout.

    Args:
        nameDepth: Forwarded to ``disambiguate`` (option ``-p``).
        worksDepth: Forwarded to ``disambiguate`` (option ``-q``).
    """
    for line in sys.stdin:
        if not line.strip():
            # A blank line (e.g. a trailing newline) is not a record;
            # feeding it to json.loads would abort the whole stream.
            continue
        data = json.loads(line)
        db = disambiguate(data['value'], nameDepth=nameDepth, worksDepth=worksDepth)
        result = db.disambiguation()
        data['value'] = str(result[0][0]) if result else "None"
        sys.stdout.write(json.dumps(data))
        sys.stdout.write('\n')
        # One record costs several throttled HTTP requests, so hand the
        # result over now rather than leaving it in the output buffer.
        sys.stdout.flush()


if __name__ == "__main__":
    plac.call(main)