#!/usr/bin/perl

=head1 NAME

wsource2mets.pl

=head1 DESCRIPTION

Convert a Wikisource Index page to a METS/MODS file to display a digitized
work in DFG-Viewer.

THIS IS A BADLY HACKED PROOF-OF-CONCEPT ONLY!

Author: Jakob Voss, Date: 2008-03-31

=cut

use strict;
use warnings;

use Data::Dumper;
use Encode qw(decode encode);
use JSON;
use LWP::Simple;

# NOTE(review): in this copy of the script the XML markup appears to have been
# stripped from the heredoc and from every output print() below; only the text
# between the tags survived.  The literals are kept exactly as found -- restore
# the METS/MODS markup from the original source before relying on the output.
# Several variables ($dmdsecid, $logmapid, $amdsecid, $physid, $rightsid,
# $imgurl, $id, the page labels) are presumably only referenced by that markup.

# Base URL of the imageinfo query; the thumbnail width is appended to it.
my $API_URL = 'http://de.wikisource.org/w/api.php?format=json&action=query'
            . '&prop=imageinfo&iiprop=url&iiurlwidth=';

# The MediaWiki API accepts at most 50 titles per request for ordinary
# clients, so longer page lists are queried in batches.
my $MAX_TITLES_PER_QUERY = 50;

# decode_json() returns character strings, so the input is decoded as well
# (otherwise titles with umlauts never match) and the output is re-encoded.
binmode STDOUT, ':encoding(UTF-8)';
binmode STDERR, ':encoding(UTF-8)';

# Percent-encode a single query-string value as UTF-8 bytes.
sub escape_query_value {
    my ($value) = @_;
    my $bytes = encode('UTF-8', $value);
    $bytes =~ s/([^A-Za-z0-9._~:-])/sprintf('%%%02X', ord $1)/ge;
    return $bytes;
}

# Ask the API for the thumbnail URL of every title.
#
# Parameters: thumbnail width, list of page titles.
# Returns:    two hash references -- thumbnail URL by title as reported by the
#             API, and API-normalized title by requested title.
# Dies when an HTTP request fails or the response is not valid JSON.
sub fetch_image_urls {
    my ($width, @titles) = @_;
    my (%url_for, %canonical_for);

    while (my @batch = splice(@titles, 0, $MAX_TITLES_PER_QUERY)) {
        my $url = $API_URL . $width . '&titles='
                . join('%7C', map { escape_query_value($_) } @batch);

        my $json = get($url);
        die "Request failed: $url\n" unless defined $json;

        my $query = decode_json($json)->{query} || {};

        # The API reports how it rewrote requested titles (namespace alias,
        # underscores, ...); remember that instead of guessing the prefix.
        foreach my $norm (@{ $query->{normalized} || [] }) {
            $canonical_for{ $norm->{from} } = $norm->{to};
        }

        foreach my $page (values %{ $query->{pages} || {} }) {
            my $info = $page->{imageinfo};

            # Pages without an image carry no imageinfo; they are reported
            # as missing by the caller rather than crashing here.
            next unless ref $info eq 'ARRAY' && @$info;

            my $thumburl = $info->[0]{thumburl};
            $url_for{ $page->{title} } = $thumburl if defined $thumburl;
        }
    }

    return (\%url_for, \%canonical_for);
}

# --- Parse the index template: header fields and the page list -------------

my ($author, $title, $location, $year);
my $pagemode  = 0;     # true while inside the |SEITEN= parameter
my @pagelines = ();

while (my $line = <>) {
    chomp $line;
    $line = decode('UTF-8', $line);
    $line =~ s/^\s+//;

    last if $line =~ /^\}\}/;                    # end of the template
    $pagemode = 0 if $line =~ /^\|[A-Z]/;        # next parameter starts

    if ($pagemode) {
        push @pagelines, $line;
    }
    elsif ($line =~ /^\|AUTOR=(.*)/) {
        $author = $1;
        $author =~ s/['[\]]//g;                  # drop wiki quotes and brackets
    }
    elsif ($line =~ /^\|TITEL=(.*)/) {
        $title = $1;
        $title =~ s/['[\]]//g;
    }
    elsif ($line =~ /^\|JAHR=(.*)/) {
        $year = $1;
    }
    elsif ($line =~ /^\|ORT=(.*)/) {
        $location = $1;
    }
    elsif ($line =~ /^\|SEITEN=/) {
        $pagemode = 1;
    }
}

# Fields absent from the index page are emitted as empty strings.
foreach my $field ($author, $title, $location, $year) {
    $field = '' unless defined $field;
}

my $dmdsecid = "md123";
my $logmapid = "log123";
my $amdsecid = "amd123";
my $physid   = "phys-123";
my $rightsid = "rights123";

print <<XMLDATA;
$title
text
$location
$year
Wikisource
http://upload.wikimedia.org/wikisource/de/b/bc/Wiki.png
http://de.wikisource.org
XMLDATA

# --- Collect the pages listed in the index ----------------------------------

my @images = ();
foreach my $pageline (@pagelines) {
    # TODO: extract the structure (title, preface, table of contents...)
    next unless $pageline =~ /^\[\[Seite:([^|]+)\|(.*)\]\]/;
    my ($page, $label) = ($1, $2);
    $label =~ s/<\/?[^>]+>//g;                   # remove HTML tags
    push @images, { label => $label, page => $page };
}

# --- Resolve thumbnail URLs --------------------------------------------------

my $iiurlwidth = 600;
my @titles     = map { "Image:" . $_->{page} } @images;

my ($imgurls, $canonical_for) = fetch_image_urls($iiurlwidth, @titles);

print STDERR "Extracted " . scalar(keys %$imgurls) . " image URLs\n";

foreach my $img (@images) {
    my $requested = "Image:" . $img->{page};
    my $title     = exists $canonical_for->{$requested}
                  ? $canonical_for->{$requested}
                  : $requested;
    print STDERR "Missing: $title\n" unless defined $imgurls->{$title};
    $img->{url} = $imgurls->{$title};
}

print STDERR Dumper($imgurls) . "\n";

# --- Emit the file section and structure maps --------------------------------

print "\n";
foreach my $id (0 .. $#images) {
    my $imgurl = $images[$id]->{url};
    print " \n";    # TODO: mime-type
    print " \n";
    print " \n";
}
print "\n";
print "\n";
print "\n";
print "\n";
print "\n";
print "\n";

print " \n";
foreach my $id (0 .. $#images) {
    print " \n" . " \n";
}
print " \n";
print " \n";
print " \n";
print " \n";
print " \n";
print "\n";

#print "Author: $author\n";
#print "Title: $title\n";
#print "Year: $year\n";
#print "Location: $location\n";