Newer
Older
explore-corpus / Niveau-0 / Extraction_Infos_Depuis_wos.pl
@Pan Pan Hu Pan Pan Hu on 11 Aug 2017 1 KB 6commits
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use open qw/:std :utf8/;

use Getopt::Long;

# Script name without its directory part; displayed by usage().
my ($programme) = $0 =~ m|^(?:.*/)?(.+)|;

my $input  = "";
my $output = "metadata.txt";

# Two-letter tags copied verbatim, in output column order. The "UT" column
# comes last and is handled separately because its value is turned into an
# HTML link.
my @champs      = qw(TI SO LA DT AB PU J9 PY SC);
my $nb_colonnes = @champs + 1;

# Column index of each verbatim tag, and a single pattern matching any of them
# at the start of a line followed by one space.
my %colonne_de = map { $champs[$_] => $_ } 0 .. $#champs;
my $motif_tags = join("|", @champs);
my $re_champ   = qr/^($motif_tags) (.*)/;

{
	# Getopt::Long reports bad options through warn(): show the diagnostic,
	# then print the usage and exit. "local" restores the previous handler
	# automatically when this block is left.
	local $SIG{__WARN__} = sub { print STDERR $_[0]; usage(1); };
	GetOptions(
		"input=s"  => \$input,
		"output=s" => \$output,
	) or usage(1);
}

usage(2) if not $input;

open(my $wos, "<:utf8", $input) or die "Couldn't open file \"$input\", $!";
open(my $meta, ">:utf8", $output) or die "Couldn't open file \"$output\", $!";

print $meta join(";", @champs, "UT"), "\n";

# Values of the record currently being read, indexed by output column.
my @valeurs = ();

# Read line by line so that the whole input is never held in memory.
while (my $ligne = <$wos>) {
	chomp($ligne);
	$ligne =~ s/\r//g;

	if ($ligne =~ $re_champ) {
		# Keep everything after the tag and its separating space.
		$valeurs[$colonne_de{$1}] = $2;
	}
	elsif ($ligne =~ /UT ISTEX:(.*)/) {
		my $lien = $1;
		$valeurs[$nb_colonnes - 1] = "<a href=\"https://api.istex.fr/document/".$lien."/fulltext/pdf?sid=scodex\" target=\"_blank\">".$lien."</a>";
	}
	elsif ($ligne =~ /^ER/) {
		# End of record: always emit one cell per column, using an empty
		# string for any field the record did not provide, so every row
		# lines up with the header.
		my @colonnes = map {
			my $valeur = defined $valeurs[$_] ? $valeurs[$_] : "";
			# Protect values for the CSV format: double the inner quotes
			# and wrap the whole value in quotes.
			if ($valeur =~ /[,;"]/) {
				$valeur =~ s/"/""/g;
				$valeur = '"' . $valeur . '"';
			}
			$valeur;
		} 0 .. $nb_colonnes - 1;

		print $meta join(";", @colonnes), "\n";
		@valeurs = ();
	}
}

close($wos);
# Buffered write errors only surface when the handle is closed.
close($meta) or die "Couldn't close file \"$output\", $!";


exit 0;

# Print the command-line synopsis, then terminate the script.
#   first argument: status passed to exit()
sub usage
{
	my ($statut) = @_;

	print "Usage : $programme -i input [ -o output ]\n";

	exit $statut;
}