#!/usr/bin/env perl
# -*-Perl-*-
#
# UPDATE HISTORY:
#   mai2sgml.pl:  First created by Satoshi Sekine in 1999.
#   mai2ntc.pl:   Customized by NTCIR group in 2001.
#   mai2ntc-r.pl: Customized by NTCIR group.
#
# Usage:
#   perl mai2ntc-r.pl INPUT OUTPUT ERROR_OUTPUT
#   ex. perl mai2ntc-r.pl mai2000.txt ntc5-j-mai00.txt er-mai00.txt
#
# INPUT is a single file, or a directory whose *.txt entries are all read.
# Each input file is piped through `nkf -e`.  Lines of the form
# <sep>KEY<sep>VALUE are grouped into records (a new record starts at every
# "ID" line) and each record is written as tagged lines to OUTPUT.  Records
# whose text contains both 著作権交渉中 and 表示できません are written to
# ERROR_OUTPUT instead.
#
# Earlier revisions were run with jperl and carried a commented-out
# `use I18N::Japanese;`.  This revision uses `use utf8` and an EUC-JP I/O
# layer so that it runs on a stock perl.
#
# NOTE(review): the listing this revision was made from had lost characters
# (full-width literals apparently folded to ASCII, the filehandle missing from
# the read loop).  Literals marked "confirm" below were reconstructed and
# should be checked against the upstream script.  The bare "\n" prints in
# output() and in the main loop are reproduced exactly as found; confirm that
# they were not meant to carry additional text.
#
# NOTE(review): bytes that Encode's euc-jp table cannot map are escaped by the
# I/O layer instead of being passed through untouched -- confirm the corpus
# contains none (nkf may emit vendor-extension characters).

use 5.010;
use strict;
use warnings;
use utf8;

# Encoding produced by `nkf -e` and kept for both output files.
my $ENCODING = 'euc-jp';

# Section code (value of the AD field) => section name written to the output.
my %SECTION_NAME = (
    '01' => '1面',
    '02' => '2面',
    '03' => '3面',
    '04' => '解説',
    '05' => '社説',
    '07' => '国際',
    '08' => '経済',
    '10' => '特集',
    '12' => '総合',
    '13' => '家庭',
    '14' => '文化',
    '15' => '読書',
    '16' => '科学',
    '18' => '芸能',
    '35' => 'スポーツ',
    '41' => '社会',
);

# Input field key => output tag name.
my %TAG_FOR = (
    C0 => 'DOCNO',
    AD => 'SECTION',
    AE => 'AE',
    S1 => 'WORDS',
    T1 => 'HEADLINE',
    T2 => 'TEXT',
);

# Tags that are synthesised rather than copied from one input field.
my %NEW_TAG_FOR = (
    LANG => 'LANG',
    AF   => 'DATE',
);

my $LANG = 'JA';

# State of the record currently being collected.
my %keyword;      # field key => array ref of transferred values, in input order
my $date = '';    # YYYY-MM-DD built from the record's AF field

# Output handles, opened in the main section below.
my ( $out_fh, $err_fh );

# zen2han($text)
# Returns a copy of $text in which full-width ASCII variants (U+FF01..U+FF5E),
# the ideographic space and a few JIS punctuation marks are replaced by their
# ASCII counterparts.  The argument itself is not modified.  An undefined
# argument yields the empty string.
sub zen2han {
    my ($text) = @_;
    return '' unless defined $text;

    # Contiguous block: ideographic space, then full-width '!' .. '~'.
    $text =~ tr/\x{3000}\x{FF01}-\x{FF5E}/ !-~/;

    # JIS punctuation present in the original translation list:
    #   U+201D -> "   U+2019 -> '   U+2212 -> -
    #   U+FFE5 -> \   U+2015 -> _   U+FFE3 -> ~
    # NOTE(review): the original list had one more character in the position
    # of the backquote that could not be recovered -- confirm.
    $text =~ tr/\x{201D}\x{2019}\x{2212}\x{FFE5}\x{2015}\x{FFE3}/"'\-\\_~/;

    return $text;
}

# transfer($key, $context)
# Converts the raw value $context of the field named $key into the string
# that is stored for the record:
#   AF  half-width copy of the value; also sets $date to YYYY-MM-DD, taking
#       two-digit years below 50 as 20xx and all others as 19xx
#   C0  half-width copy of the value
#   AE  '有' when the value is Y, otherwise '無'
#   S1  the trailing "全...文字" part of the value, converted to half-width
#   AD  the section name registered for the code ('' for an unknown code)
#   any other key: the value unchanged
# Undefined values are returned as ''.
sub transfer {
    my ( $key, $context ) = @_;
    $key //= '';

    if ( $key eq 'AF' ) {
        my $data = zen2han($context);
        if ( length($data) >= 6 ) {
            my $century = substr( $data, 0, 2 ) < 50 ? '20' : '19';
            $date = join '-',
                $century . substr( $data, 0, 2 ),
                substr( $data, 2, 2 ),
                substr( $data, 4, 2 );
        }
        else {
            # Too short to hold YYMMDD: leave the date blank instead of
            # printing fragments.
            $date = '';
        }
        return $data;
    }

    return zen2han($context) if $key eq 'C0';

    if ( $key eq 'AE' ) {
        # Compared after zen2han so that a full-width Y is accepted as well.
        # NOTE(review): the listing compared against ASCII 'Y' -- confirm
        # which form the data uses.
        return zen2han($context) eq 'Y' ? '有' : '無';
    }

    if ( $key eq 'S1' ) {
        # Matched against the value passed in rather than the caller's $_.
        my $size;
        ($size) = $context =~ /.*(全(.*)文字)/ if defined $context;
        return zen2han($size);
    }

    if ( $key eq 'AD' ) {
        return $SECTION_NAME{ zen2han($context) } // '';
    }

    return $context // '';
}

# output()
# Writes the record held in %keyword (and $date) as tagged lines.  A record
# whose T2 text contains both 著作権交渉中 and 表示できません goes to the error
# handle, every other record to the normal output handle.  Nothing is written
# when no field has been collected.
sub output {
    return unless %keyword;

    # First stored value of a field, or '' when the field is absent.
    my $first_of = sub {
        my ($key) = @_;
        my $values = $keyword{$key};
        return ( $values && defined $values->[0] ) ? $values->[0] : '';
    };

    my $txt = join "\n", map { $_ // '' } @{ $keyword{T2} || [] };
    my $withheld = ( $txt =~ /著作権交渉中/ && $txt =~ /表示できません/ ) ? 1 : 0;
    my $fh = $withheld ? $err_fh : $out_fh;

    print {$fh} "\n";
    print {$fh} '<', $TAG_FOR{C0}, '>', $LANG, '-', $first_of->('C0'), "\n";
    print {$fh} '<', $NEW_TAG_FOR{LANG}, '>', $LANG, "\n";
    for my $key (qw(AD AE S1 T1)) {
        print {$fh} '<', $TAG_FOR{$key}, '>', $first_of->($key), "\n";
    }
    print {$fh} '<', $NEW_TAG_FOR{AF}, '>', $date, "\n";
    print {$fh} '<', $TAG_FOR{T2}, ">\n", $txt, "\n\n";
    print {$fh} "\n";

    # The error file receives one more newline per record than the normal
    # output, as in earlier revisions.
    print {$fh} "\n" if $withheld;

    return;
}

# flush_record()
# Writes the collected record and clears all per-record state, so that
# neither fields nor the date can leak into the next record or the next file.
sub flush_record {
    output();
    %keyword = ();
    $date    = '';
    return;
}

# ls($path)
# Returns "$path/NAME" for every entry of directory $path whose name ends in
# ".txt" and which is not a symbolic link, sorted by name.  Dies when the
# directory cannot be opened.
sub ls {
    my ($path) = @_;

    opendir( my $dh, $path ) or die "cannot open directory $path: $!";
    my @names = sort readdir($dh);
    closedir($dh);

    return map { "$path/$_" }
           grep { /^.*\.txt$/ && !-l "$path/$_" } @names;
}

# ---------------------------------------------------------------- main ----

die "usage: $0 INPUT_FILE_OR_DIR OUTPUT_FILE ERROR_FILE\n" unless @ARGV >= 3;
my ( $infile, $ofile, $erfile ) = @ARGV;

my @files = -d $infile ? ls($infile) : ($infile);

open( $out_fh, '>', $ofile )  or die "cannot open $ofile: $!";
open( $err_fh, '>', $erfile ) or die "cannot open $erfile: $!";
binmode( $_, ":encoding($ENCODING)" ) for $out_fh, $err_fh;

for my $file (@files) {

    # List form: the file name reaches nkf as one argument, never a shell.
    open( my $in, '-|', 'nkf', '-e', $file )
        or die "cannot run nkf on $file: $!";
    binmode( $in, ":encoding($ENCODING)" );

    my $ids_seen = 0;    # number of ID lines read from this file so far

    while ( my $line = <$in> ) {
        chomp $line;

        # <sep>KEY<sep>VALUE with <sep> = U+FF3C (full-width reverse solidus).
        # NOTE(review): separator reconstructed from a damaged pattern --
        # confirm against the input data.
        my ( $tag, $context ) = $line =~ /\x{FF3C}(.*)\x{FF3C}(.*)/;
        next unless defined $tag;

        my $key  = zen2han($tag);
        my $data = transfer( $key, $context );

        if ( $key eq 'ID' ) {
            $ids_seen++;

            # The first ID of a file only opens a record.  Every later ID
            # closes the previous one; from the third ID on, a newline is
            # written to the normal output first.
            if ( $ids_seen >= 2 ) {
                print {$out_fh} "\n" if $ids_seen > 2;
                flush_record();
            }
        }

        push @{ $keyword{$key} }, $data;
    }

    # Last record of the file.
    flush_record();

    close($in)
        or warn "nkf did not finish cleanly for $file (status $?)\n";
}

close($out_fh) or die "cannot close $ofile: $!";
close($err_fh) or die "cannot close $erfile: $!";