diff --git "a/Niveau-2/Vieillissement_V1/Iramuteq/Pr\303\251paration_Donn\303\251es_entr\303\251e.pl" "b/Niveau-2/Vieillissement_V1/Iramuteq/Pr\303\251paration_Donn\303\251es_entr\303\251e.pl"
new file mode 100644
index 0000000..4b3474f
--- /dev/null
+++ "b/Niveau-2/Vieillissement_V1/Iramuteq/Pr\303\251paration_Donn\303\251es_entr\303\251e.pl"
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+# Concatenate every regular file of a directory into a single corpus file:
+# each document is preceded by a "**** *souscorpus_<name>" header line whose
+# <name> is the run of letters found before a "_" in the file name.
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+
+use Encode;
+use Getopt::Long;
+
+# Script name without its directory part, decoded for the usage message.
+my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
+$programme = decode_utf8($programme);
+
+my $dir    = "";    # input directory, e.g. "Vieillissement_TXT"
+my $sortie = "";    # output corpus file, e.g. "Vieillissement_total.txt"
+
+# Any warning raised while parsing the options (unknown option, missing
+# value) prints the usage message and exits with code 1.
+eval {
+    $SIG{__WARN__} = sub {usage(1);};
+    GetOptions(
+        "sortie=s" => \$sortie,
+        "dir=s"    => \$dir,
+    );
+};
+$SIG{__WARN__} = sub {warn $_[0];};
+
+# Both options are mandatory.
+usage(2) if not $dir or not $sortie;
+
+# Keep regular files only ("." and ".." are dropped) in a stable order.
+opendir(my $dh, $dir) or die "Couldn't open directory $dir, $!";
+my @files = sort grep { -f "$dir/$_" } readdir($dh);
+closedir($dh);
+
+open(my $out, ">:utf8", $sortie) or
+    die "Couldn't open file $sortie, $!";
+
+foreach my $file (@files) {
+    # The sub-corpus name is taken from this match only; a file that does
+    # not follow the "<letters>_<digits>" convention is skipped so that it
+    # never inherits the name captured for the previous file.
+    my ($souscorpus) = $file =~ /(\pL+)_\d*/;
+    if (not defined $souscorpus) {
+        warn "Skipping $file: no sub-corpus name found\n";
+        next;
+    }
+    open(my $in, "<:encoding(UTF-8)", "$dir/$file") or
+        die "Couldn't open file $file, $!";
+    print {$out} "**** *souscorpus_$souscorpus\n";
+    print {$out} <$in>;
+    close($in);
+}
+close($out) or die "Couldn't close file $sortie, $!";
+
+exit 0;
+
+# Print the command-line synopsis on STDERR and exit with the given code.
+sub usage
+{
+    my $code = shift;
+
+    print STDERR "Usage : $programme -d dir -s sortie \n";
+
+    exit $code;
+}
\ No newline at end of file
diff --git a/Niveau-2/Vieillissement_V1/R/TopicModels_Code_R b/Niveau-2/Vieillissement_V1/R/TopicModels_Code_R
new file mode 100644
index 0000000..67f6dff
--- /dev/null
+++ b/Niveau-2/Vieillissement_V1/R/TopicModels_Code_R
@@ -0,0 +1,43 @@
+#data preparation
+Viellissement <- file.path("chemin_vers_corpus")#set the path to the corpus here
+Viellissement
+library(NLP)#load the NLP package
+library(tm)#load the tm package
+docs <- Corpus(DirSource(Viellissement))#read the corpus
+toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))#transformer that replaces every match of pattern with a space
+summary(docs)
+docs <- tm_map(docs, toSpace, "/|@|\\|")#handle a few special characters (/, @ and |)
+docs <- tm_map(docs,content_transformer(tolower))#convert to lower case
+docs <- tm_map(docs,removeNumbers)#remove numbers
+docs <- tm_map(docs,removePunctuation)#remove punctuation
+docs <- tm_map(docs,removeWords,stopwords("english")) #remove the English stop words provided by tm
+docs <- tm_map(docs,removeWords,c("age","studies","study","use","fig","may","a..","also","among","can","..m")) #remove the stop words defined for this corpus
+docs <- tm_map(docs,stripWhitespace)#collapse extra whitespace
+
+dtm <- DocumentTermMatrix(docs)#turn the corpus into a term frequency matrix (documents as rows, terms as columns)
+dtm
+
+#topic modelling
+library(topicmodels)
+ap_lda <- LDA(dtm, k = 20, control = list(seed = 1234))#k is the number of topics
+ap_lda
+library(tidytext)
+ap_topics <- tidy(ap_lda,matrix="beta")#beta is a matrix (topics and their word lists with weights)
+ap_topics
+
+#visualisation
+library(ggplot2)#load the ggplot2 package
+library(dplyr)#load the dplyr package
+ap_top_terms <- ap_topics %>%
+group_by(topic) %>%
+top_n(10,beta) %>%
+ungroup() %>%
+arrange(topic,-beta)
+
+ap_top_terms <- ap_topics %>%
+group_by(topic) %>%
+top_n(10,beta) %>%
+ungroup() %>%
+arrange(topic,-beta)
+ap_top_terms %>%
+ggplot(aes(term,beta,fill=factor(topic)))+geom_col(show.legend = FALSE)+facet_wrap(~ topic,scales = "free")+coord_flip()
diff --git a/Niveau-2/Vieillissement_V2/Iramuteq/Correspondance_Identifient_Istex_NomRevue.pl b/Niveau-2/Vieillissement_V2/Iramuteq/Correspondance_Identifient_Istex_NomRevue.pl
new file mode 100644
index 0000000..44affd9
--- /dev/null
+++ b/Niveau-2/Vieillissement_V2/Iramuteq/Correspondance_Identifient_Istex_NomRevue.pl
@@ -0,0 +1,61 @@
+#!/usr/bin/perl
+# Copy the "J9 " and "UT ISTEX:" lines of the file given with -w to the file
+# given with -i. A "J9 " line is written followed by ";" and no newline, a
+# "UT ISTEX:" line is written followed by a newline, so both end up on the
+# same output line when they appear in that order.
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+
+use Encode;
+use Getopt::Long;
+
+my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
+$programme = decode_utf8($programme);
+
+my $wos  = "";    # input file, e.g. "Vieil_v2_wos.txt"
+my $info = "";    # output file, e.g. "J9UT.txt"
+
+# A warning raised while parsing the options prints the usage and exits.
+eval {
+    $SIG{__WARN__} = sub {usage(1);};
+    GetOptions(
+        "wos=s"  => \$wos,
+        "info=s" => \$info,
+    );
+};
+$SIG{__WARN__} = sub {warn $_[0];};
+
+usage(2) if not $wos or not $info;
+
+open(my $in, "<:encoding(UTF-8)", $wos) or
+    die "Couldn't open file $wos, $!";
+open(my $out, ">:encoding(UTF-8)", $info) or
+    die "Couldn't open file $info, $!";
+
+while (my $ligne = <$in>) {
+    chomp($ligne);
+    if ($ligne =~ /^J9 /) {
+        # No newline here: the next "UT ISTEX:" line completes the record.
+        print {$out} "$ligne;";
+    }
+    if ($ligne =~ /^UT ISTEX:/) {
+        # This field terminates the output line.
+        print {$out} "$ligne\n";
+    }
+}
+close($in);
+close($out) or die "Couldn't close file $info, $!";
+
+exit 0;
+
+# Print the command-line synopsis on STDERR and exit with the given code.
+sub usage
+{
+    my $code = shift;
+
+    print STDERR "Usage : $programme -w wos -i info \n";
+
+    exit $code;
+}
diff --git "a/Niveau-2/Vieillissement_V2/Iramuteq/Pr\303\251paration_donn\303\251es_Ira.pl" "b/Niveau-2/Vieillissement_V2/Iramuteq/Pr\303\251paration_donn\303\251es_Ira.pl"
new file mode 100644
index 0000000..49e1ce4
--- /dev/null
+++ "b/Niveau-2/Vieillissement_V2/Iramuteq/Pr\303\251paration_donn\303\251es_Ira.pl"
@@ -0,0 +1,85 @@
+#!/usr/bin/perl
+# Build a corpus file from a directory of article files (-r). The journal
+# list (-j) and the article file list (-i) are read in parallel: line N of
+# the first describes line N of the second. Each article is written to the
+# output (-d) after a "**** *revue_<journal> *id_<file>" header line, with
+# every "*" removed from its text.
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+
+use Encode;
+use Getopt::Long;
+
+my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
+$programme = decode_utf8($programme);
+
+my $reportoire = "";    # article directory, e.g. "Vv2_new"
+my $journal    = "";    # journal name list, e.g. "Nom_Du_Revue.txt"
+my $id         = "";    # article file name list, e.g. "Nom_Du_Article.txt"
+my $data       = "";    # output corpus, e.g. "Vieillissement_V2_New.txt"
+
+# A warning raised while parsing the options prints the usage and exits.
+eval {
+    $SIG{__WARN__} = sub {usage(1);};
+    GetOptions(
+        "reportoire=s" => \$reportoire,
+        "journal=s"    => \$journal,
+        "id=s"         => \$id,
+        "data=s"       => \$data,
+    );
+};
+$SIG{__WARN__} = sub {warn $_[0];};
+
+usage(2) if not $reportoire or not $journal or not $id or not $data;
+
+# Journal names, one per line; whitespace becomes "_" so that a name stays
+# a single token in the header line.
+open(my $revue_fh, "<:utf8", $journal)
+    or die "Couldn't open file $journal, $!";
+my @revues = <$revue_fh>;
+close($revue_fh);
+foreach my $revue (@revues) {
+    chomp($revue);
+    $revue =~ s/\s/_/g;
+}
+
+# Article file names, one per line.
+open(my $id_fh, "<:utf8", $id) or
+    die "Couldn't open file $id, $!";
+my @articles = <$id_fh>;
+close($id_fh);
+chomp(@articles);
+
+# The lists are paired by position: every journal needs an article file.
+die "$id has fewer lines than $journal\n" if @articles < @revues;
+
+open(my $out, ">:encoding(UTF-8)", $data)
+    or die "Couldn't open file $data, $!";
+
+print {$out} "\"revue\",\"id\"\n";
+
+foreach my $i (0 .. $#revues) {
+    open(my $in, "<:utf8", "$reportoire/$articles[$i]")
+        or die "Couldn't open file $articles[$i], $!";
+    print {$out} "**** *revue_$revues[$i] *id_$articles[$i]\n";
+    my @texte = <$in>;
+    close($in);
+    # Presumably "*" is dropped so that no text line looks like a header.
+    s/\*//g for @texte;
+    print {$out} @texte;
+}
+close($out) or die "Couldn't close file $data, $!";
+
+exit 0;
+
+# Print the command-line synopsis on STDERR and exit with the given code.
+sub usage
+{
+    my $code = shift;
+
+    print STDERR "Usage : $programme -r reportoire -j journal -i id -d data \n";
+
+    exit $code;
+}
\ No newline at end of file
diff --git "a/Niveau-3/Iramuteq/Construction_Dictionnaires_Nom_Esp\303\250ce.pl" "b/Niveau-3/Iramuteq/Construction_Dictionnaires_Nom_Esp\303\250ce.pl"
new file mode 100755
index 0000000..b98f19e
--- /dev/null
+++ "b/Niveau-3/Iramuteq/Construction_Dictionnaires_Nom_Esp\303\250ce.pl"
@@ -0,0 +1,70 @@
+#!/usr/bin/perl
+# For every line of the species list (-i) that, once lower-cased, reads
+# "<word> <word><TAB><type>", write "<word>_<word>" twice plus "nom" to the
+# lexicon (-l) and "<word> <word><TAB><word>_<word>" to the expression
+# list (-o). <type> is the value of -t: "animalia" or "plantae".
+
+use strict;
+use warnings;
+
+use Getopt::Long;
+
+my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
+
+my $input   = "";
+my $lexique = "lexique.txt";
+my $output  = "";
+my $type    = "";
+
+eval {
+    $SIG{__WARN__} = sub {usage(1);};
+    GetOptions(
+        "input=s"   => \$input,
+        "lexique=s" => \$lexique,
+        "output=s"  => \$output,
+        "type=s"    => \$type,
+    );
+};
+$SIG{__WARN__} = sub {warn $_[0];};
+
+# -i and -o are mandatory; -l has a default value.
+if (not $input or not $output)
+{
+    usage(2);
+}
+if ($type ne "animalia" and $type ne "plantae")
+{
+    usage(3);
+}
+
+open(FILE, "<:utf8", $input) or
+    die "Couldn't open file $input, $!";
+open(LEXIQUE, ">:utf8", $lexique) or
+    die "Couldn't open file $lexique, $!";
+open(EXPRESSION, ">:utf8", $output) or
+    die "Couldn't open file $output, $!";
+
+while (my $ligne = <FILE>) {
+    $ligne = lc($ligne);
+    # $type was validated above, so it can be interpolated as is.
+    if ($ligne =~ /(\pL*) (\pL*)\t$type/) {
+        print LEXIQUE "$1_$2\t$1_$2\tnom\n";
+        print EXPRESSION "$1 $2\t$1_$2\n";
+    }
+}
+close(FILE);
+close(LEXIQUE);
+close(EXPRESSION) or die "Couldn't close file $output, $!";
+
+exit 0;
+# Print the command-line help on STDERR and exit with the given code.
+sub usage
+{
+    my $code = shift;
+
+    print STDERR "Usage : $programme -i input -o output -t (\"animalia\"|\"plantae\") [ -l lexique ]\n";
+    print STDERR " -i input : liste des espèces\n";
+    print STDERR " -o output : liste des expressions espèces\n";
+
+    exit $code;
+}
\ No newline at end of file
diff --git "a/Niveau-3/Iramuteq/Pr\303\251paration_Donn\303\251es_entr\303\251e_Iramuteq.pl" "b/Niveau-3/Iramuteq/Pr\303\251paration_Donn\303\251es_entr\303\251e_Iramuteq.pl"
new file mode 100755
index 0000000..1875bf0
--- /dev/null
+++ "b/Niveau-3/Iramuteq/Pr\303\251paration_Donn\303\251es_entr\303\251e_Iramuteq.pl"
@@ -0,0 +1,87 @@
+#!/usr/bin/perl
+# Build a corpus file from the "<racine>_<number>.txt" files of a directory
+# (-d) and a metadata file (-m). Metadata line N becomes the "****" header
+# of the N-th file, in sorted file-name order, and the text of that file
+# follows with its "*" characters and digits removed.
+
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+
+use Encode;
+use Getopt::Long;
+
+my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
+$programme = decode_utf8($programme);
+
+my $dir      = "";    # e.g. "ArthropodesTest"
+my $racine   = "";    # e.g. "Arthropodes"
+my $metadata = "";    # e.g. "metadata.csv"
+my $sortie   = "";    # e.g. "Arthropodes_Ira.txt"
+
+eval {
+    $SIG{__WARN__} = sub {usage(1);};
+    GetOptions(
+        "dir=s"      => \$dir,
+        "racine=s"   => \$racine,
+        "metadata=s" => \$metadata,
+        "sortie=s"   => \$sortie,
+    );
+};
+$SIG{__WARN__} = sub {warn $_[0];};
+
+usage(2) if not $dir or not $racine or not $metadata or not $sortie;
+
+# List the text files of the directory; \Q...\E and the escaped dot make
+# the root name and the extension match literally.
+opendir(my $dh, $dir) or die "Couldn't open file $dir, $!";
+my @files = sort grep { /^\Q$racine\E_\d+\.txt$/ and -f "$dir/$_" } readdir($dh);
+closedir($dh);
+my $size = @files;
+print STDERR "Total : $size\n";
+
+# Metadata input.
+open(my $meta_fh, "<:utf8", $metadata) or die "Couldn't open file $metadata, $!";
+
+# Corpus output.
+open(my $out, ">:utf8", $sortie) or die "Couldn't open file $sortie, $!";
+
+# One pass over the metadata; $i walks the sorted file list in parallel.
+# NOTE(review): the sort is alphabetical ("x_10.txt" comes before
+# "x_2.txt") - confirm that the metadata lines follow the same order.
+my $i = 0;
+while (my $entete = <$meta_fh>) {
+    if ($i > $#files) {
+        warn "More metadata lines than text files, extra lines ignored\n";
+        last;
+    }
+    $entete =~ s/"(.*?)","(.*?)","(.*?)","(.*?)"/\*\*\*\* \*s_$2 \*c_$3 \*annee_$4/;
+    print {$out} $entete;
+    open(my $in, "<:utf8", "$dir/$files[$i]") or die "Couldn't open file $files[$i], $!";
+    while (my $ligne = <$in>) {
+        # Remove every star and every digit.
+        $ligne =~ s/\*//g;
+        $ligne =~ s/\d//g;
+        print {$out} $ligne;
+    }
+    close($in);
+    $i++;
+}
+
+close($meta_fh);
+close($out) or die "Couldn't close file $sortie, $!";
+
+exit 0;
+
+# Print the command-line help on STDERR and exit with the given code.
+sub usage
+{
+    my $code = shift;
+
+    print STDERR "Usage : $programme -d répertoire -r racine -m métadonnées -s sortie\n\n";
+    print STDERR "Exemple : \n";
+    print STDERR " $programme -d ArthropodesTest -r Arthropodes -m metadata.csv -s Arthropodes_Ira.txt\n";
+
+    exit $code;
+}
\ No newline at end of file
diff --git a/Niveau-3/TXM/Extraction_Corps_TEI.pl b/Niveau-3/TXM/Extraction_Corps_TEI.pl
new file mode 100755
index 0000000..73b226c
--- /dev/null
+++ b/Niveau-3/TXM/Extraction_Corps_TEI.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl
+# For each ".tei" file of the input directory (-i), write the first-child
+# text of every element matched by "text/body/div/p" to a ".txt" file of
+# the same name in the output directory (-o), one element per line.
+
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+use Encode;
+use Getopt::Long;
+use XML::Twig;
+
+my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
+$programme = decode_utf8($programme);
+
+my $input  = "";
+my $output = "";
+
+eval {
+    $SIG{__WARN__} = sub {usage(1);};
+    GetOptions(
+        "input=s"  => \$input,
+        "output=s" => \$output,
+    );
+};
+$SIG{__WARN__} = sub {warn $_[0];};
+
+usage(2) if not $input or not $output;
+
+opendir(my $dh, $input) or die "Couldn't open directory $input, $!";
+
+# Regular files with a ".tei" extension only.
+my @files = sort grep { /\.tei$/ and -f "$input/$_" } readdir($dh);
+closedir($dh);
+
+my $size = @files;
+print STDERR "Total : $size\n";
+
+# The directory must be prepended ("$input/$file") to reach each file.
+foreach my $file (@files)
+{
+    # Output name: the input name with its final ".tei" turned into ".txt".
+    (my $new_nom = $file) =~ s/\.tei\z/.txt/;
+    print STDERR "$file => $new_nom\n";
+    open(my $txt, ">:encoding(UTF-8)", "$output/$new_nom") or
+        die "Couldn't open file $new_nom, $!";
+
+    # XML::Twig runs the handler on each element matched by the path below;
+    # what it extracts is written on a line of its own.
+    my $parser = XML::Twig->new
+    (
+        twig_handlers =>
+        {
+            'text/body/div/p' => sub {
+                # NOTE(review): only the first child's text is kept; confirm that $_->text is not what is wanted.
+                print {$txt} $_->first_child_text. "\n";
+            }
+        }
+    );
+    $parser->parsefile("$input/$file");
+
+    close($txt) or die "Couldn't close file $new_nom, $!";
+}
+
+exit 0;
+# Print the command-line synopsis on STDERR and exit with the given code.
+sub usage
+{
+my $code = shift;
+print STDERR "Usage: $programme -i input -o output\n";
+exit $code;
+}
\ No newline at end of file
diff --git a/post_traitement_corpus/rename_file.pl b/post_traitement_corpus/rename_file.pl
new file mode 100644
index 0000000..14ef895
--- /dev/null
+++ b/post_traitement_corpus/rename_file.pl
@@ -0,0 +1,84 @@
+#!/usr/bin/perl
+# Rename the "<40-character name>.txt" files of a directory (-d) to
+# "<11-character name>.txt". The correspondence comes from the source file
+# (-s): on each line, a run of 40 word characters is mapped to the run of
+# 11 word characters that follows it.
+use strict;
+use warnings;
+use utf8;
+use open qw/:std :utf8/;
+
+use Encode;
+use Getopt::Long;
+
+my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;
+$programme = decode_utf8($programme);
+
+my $source = "";    # e.g. "Vieil_v2.source"
+my $dir    = "";    # e.g. "Vv2_new"
+
+eval {
+    $SIG{__WARN__} = sub {usage(1);};
+    GetOptions(
+        "source=s" => \$source,
+        "dir=s"    => \$dir,
+    );
+};
+$SIG{__WARN__} = sub {warn $_[0];};
+
+usage(2) if not $dir or not $source;
+
+open(my $src, "<:encoding(UTF-8)", $source)
+    or die "Couldn't open file $source, $!";
+my @sourcefile = <$src>;
+close($src);
+print STDERR scalar(@sourcefile), "\n";    # number of lines read
+
+my %correspondance;
+foreach my $ligne (@sourcefile) {
+    chomp($ligne);
+    # A line without both names is skipped rather than stored with the
+    # captures left over from an earlier match.
+    next unless $ligne =~ /(\w{40})(\W*?)(\w{11})/;
+    $correspondance{$1} = $3;
+}
+
+print STDERR scalar(keys %correspondance), "\n";    # number of distinct keys
+
+opendir(my $dh, $dir) or die "Couldn't open the directory $dir, $!";
+my @files = sort readdir($dh);
+closedir($dh);
+
+foreach my $file (@files) {
+    # Only "<40 word characters>.txt" entries are candidates, which also
+    # leaves "." and ".." alone.
+    next unless $file =~ /^(\w{40})\.txt$/;
+    my $cible = $correspondance{$1};
+    if (not defined $cible) {
+        warn "No correspondence for $file, file left unchanged\n";
+        next;
+    }
+    my $new = "$cible.txt";
+    # rename() replaces an existing target silently: never overwrite.
+    if (-e "$dir/$new") {
+        warn "$new already exists, $file left unchanged\n";
+        next;
+    }
+    if (not rename("$dir/$file", "$dir/$new")) {
+        warn "Couldn't rename $file to $new, $!\n";
+        next;
+    }
+    print "$file => $new\n";
+}
+
+exit 0;
+
+# Print the command-line synopsis on STDERR and exit with the given code.
+sub usage
+{
+    my $code = shift;
+
+    print STDERR "Usage : $programme -s source -d dir\n";
+
+    exit $code;
+}