#!/usr/bin/perl

# doajarticles2mylibrary.pl - harvest DOAJ articles and import them into MyLibrary

# Eric Lease Morgan
# 2007-07-02 - added authors as a facet and individual creators as a term
# 2007-07-01 - really got it going
# 2007-06-30 - first cut; based on doaj2mylibrary.pl

use strict;
use warnings;

# define where the OAI interface to DOAJ is
use constant DOAJ => 'http://www.doaj.org/oai.article';

# include the necessary modules/subroutines
use MyLibrary::Core;
use Net::OAI::Harvester;

# NOTE(review): this file also defines make_or_get_* subroutines below; if
# subroutines.pl defines subs with the same names, the required versions
# replace them at run time -- confirm which set is intended.
require 'subroutines.pl';

# configure
MyLibrary::Config->instance( 'articles' );
binmode( STDOUT, ':utf8' );

# OAI sets that are deliberately not harvested
my %skip_set = map { $_ => 1 } qw(
    Agriculture_and_Food_Sciences
    Arts_and_Architecture
    Biology_and_Life_Sciences
    Health_Sciences
);

# get and/or create location types, facets, and terms
my $location_type = make_or_get_location_type( 'URL', 'Internet pointers' );
my $formatID      = make_or_get_facet_id( 'Formats', 'A list of physical formats for information resources' );
my $articleTermID = make_or_get_term_id( 'Articles', 'Scholarly papers usually five or six pages long', $formatID );
#my $keywordsID = &make_or_get_facet_id( 'Keywords', 'Subject-like headings; think "tags"' );
my $publishersID = make_or_get_facet_id( 'Publishers', 'Distributors of data and information' );
my $sourcesID    = make_or_get_facet_id( 'Sources', 'Usually journal titles' );
#my $authorsID = &make_or_get_facet_id( 'Authors', 'People who create data and information' );
my $repositoriesID = make_or_get_facet_id( 'Repositories', 'OAI data collections' );
my $doajarticlesID = make_or_get_term_id( 'DOAJ Articles', 'http://www.doaj.org/oai.article', $repositoriesID );

# create specific facets/terms for DOAJ
my $themeID   = make_or_get_facet_id( 'Themes', 'Over-arching and high-level subject terms' );
my $harvester = Net::OAI::Harvester->new( 'baseURL' => DOAJ );
my $sets      = $harvester->listSets;

# without the list of sets there is nothing to harvest, so fail loudly
# instead of silently printing "Done"
if ( $sets->errorCode ) {
    die 'Unable to list DOAJ sets: ', $sets->errorString, "\n";
}

# make a theme term for every set (including the ones skipped below) and
# remember its id so the harvest loop does not have to look it up again
my @set_specs = $sets->setSpecs;
my %theme_term_id_for;
for my $set_spec (@set_specs) {
    $theme_term_id_for{$set_spec}
        = make_or_get_term_id( $sets->setName($set_spec), 'This theme comes from the DOAJ', $themeID );
}

# loop through each OAI set
my $index = 0;
SET:
for my $set_spec (@set_specs) {

    next SET if $skip_set{$set_spec};

    # get this set name
    print "\n$set_spec\n";
    my $termID = $theme_term_id_for{$set_spec};

    # get the records in this set
    my $records = $harvester->listAllRecords( metadataPrefix => 'oai_dc', set => $set_spec );
    if ( $records->errorCode ) {
        warn "Skipping set $set_spec: ", $records->errorString, "\n";
        next SET;
    }

    # process each record
    RECORD:
    while ( my $record = $records->next ) {

        # increment
        $index++;

        # extract the metadata
        my $FKey      = $record->header->identifier;
        my $metadata  = $record->metadata;
        my $name      = $metadata->title;
        my @creators  = $metadata->creator;
        my $note      = $metadata->description;
        my $publisher = $metadata->publisher;
        next RECORD if !$publisher;
        my $location = $metadata->identifier;
        next RECORD if !$location;
        my $date     = $metadata->date;
        my $source   = $metadata->source;
        my @subjects = $metadata->subject;

        # display
        {
            # name, note, source and date are not checked above and may be
            # undefined; print them as empty strings without warning
            no warnings 'uninitialized';

            print " record: $index\n";
            print " fkey: $FKey\n";
            print " name: $name\n";
            for my $creator (@creators) { print " creator: $creator\n" }
            print " note: $note\n";
            print " publisher: $publisher\n";
            print " location: $location\n";
            print " source: $source\n";
            print " date: $date\n";
            for my $subject (@subjects) { print " subject: $subject\n" }
            print " action: ";
        }

        # check to see if it already exists; look it up once and reuse the
        # object (the constructor is relied on to return false on no match)
        my $existing = MyLibrary::Resource->new( fkey => $FKey );

        if ( !$existing ) {

            # add terms
            # NOTE(review): $source is not validated like $publisher and
            # $location are, so an empty source becomes an unnamed term --
            # confirm whether such records should be skipped instead.
            my $source_id    = make_or_get_term_id( $source,    '', $sourcesID );
            my $publisher_id = make_or_get_term_id( $publisher, '', $publishersID );
            #my @keyword_ids = (); foreach ( @subjects ) { push @keyword_ids, &make_or_get_term_id ( $_, '', $keywordsID ) }
            #my @author_ids = (); foreach ( @creators ) { push @author_ids, &make_or_get_term_id ( $_, '', $authorsID ) }

            # create it
            my $resource = MyLibrary::Resource->new;
            $resource->fkey($FKey);
            $resource->name($name);

            # creators and subjects are stored pipe-terminated ("a|b|")
            $resource->creator( join q{}, map {"$_|"} @creators );
            $resource->note($note);
            $resource->publisher($publisher);
            $resource->source($source);
            $resource->date($date);
            $resource->subject( join q{}, map {"$_|"} @subjects );
            #$resource->related_terms( new => [ @author_ids, @keyword_ids, $source_id, $publisher_id, $doajarticlesID, $articleTermID, $termID ]);
            $resource->related_terms( new => [ $source_id, $publisher_id, $doajarticlesID, $articleTermID, $termID ] );
            $resource->add_location( location => $location, location_type => $location_type );
            $resource->commit;
            print "added (", $resource->id, ").\n";

        }
        else {

            # update it: the article also belongs to this set's theme
            $existing->related_terms( new => [$termID] );
            $existing->commit;
            print "already exists. Updated.\n";

        }

        # make things pretty
        print "\n";

    }

}

# done
print "\nDone\n";
exit;


#############
# subroutines

# Return the id of the facet called $name, creating the facet (with $note
# as its description) when no facet of that name exists yet.
sub make_or_get_facet_id {

    # get the input
    my $name = shift;
    my $note = shift;

    # look the facet up once; a false result means it must be created
    my $facet = MyLibrary::Facet->new( name => $name );

    if ( !$facet ) {

        # create it
        $facet = MyLibrary::Facet->new;
        $facet->facet_name($name);
        $facet->facet_note($note);
        $facet->commit;
        #print "The facet $name ($note) has been created.\n";

    }

    # done
    return $facet->facet_id;

}


# Return the id of the term called $name, creating it under facet
# $facet_id (with $note as its description) when no such term exists.
# NOTE(review): the lookup is by name only, so an existing term with the
# same name is reused whatever facet it belongs to -- confirm this is
# intended.
sub make_or_get_term_id {

    # get the input
    my $name     = shift;
    my $note     = shift;
    my $facet_id = shift;

    # look the term up once; a false result means it must be created
    my $term = MyLibrary::Term->new( name => $name );

    if ( !$term ) {

        # create it
        $term = MyLibrary::Term->new;
        $term->term_name($name);
        $term->term_note($note);
        $term->facet_id($facet_id);
        $term->commit;
        #print "The term $name ($note) has been created.\n";

    }

    # done
    return $term->term_id;

}


# Return the id of the location type called $name, creating it (with
# $note as its description) when none of the existing types has that name.
sub make_or_get_location_type {

    # get input
    my $name = shift;
    my $note = shift;

    # initialize
    my $location_type;

    # see if it exists
    for my $type_id ( MyLibrary::Resource::Location::Type->all_types ) {

        my $type = MyLibrary::Resource::Location::Type->new( id => $type_id );
        if ( $type->name eq $name ) {

            $location_type = $type->location_type_id;
            #print "Location type $name exists\n";
            last;

        }

    }

    # if not, then create it
    if ( !$location_type ) {

        my $type = MyLibrary::Resource::Location::Type->new;
        $type->name($name);
        $type->description($note);
        $type->commit;
        $location_type = $type->location_type_id;
        #print "Location type $name was created\n";

    }

    # done
    return $location_type;

}