#!/usr/bin/perl use strict; use warnings; use open IO => ':bytes'; use Encode qw'decode encode'; use Pod::Usage qw'pod2usage'; use Getopt::Long qw':config no_auto_abbrev no_getopt_compat no_ignore_case no_bundling auto_version auto_help'; ## global constants my @version = (1,0,2); my $xml_indent = "\t"; my $xml_badchar_substitute = " "; ## xml markup template generator sub make_xml_templates # Return a reference to a hash containing xml template strings # used to assemble a MARCXML file. Boolean args (in order): # * pretty formatting / indentation # * add explicit namespace ("marc" by default) to tags # * add xml header and root element { my $indent = shift; my $namespace = shift; my $header = shift; # evaluate arguments my $newline = $indent?"\n":""; $indent = $indent?$xml_indent:""; $namespace = defined($namespace)?($namespace or "marc"):""; my $nscolon = $namespace?":":""; # two helper functions to create templates my $mkstart = sub # args: indent, name+attrs, final_newline? { return sprintf('%s<%s%s%s>%s',$indent x $_[0],$namespace, $nscolon,$_[1],$_[2]?$newline:""); }; my $mkend = sub # args: indent, name { return sprintf('%s%s',$indent x $_[0],$namespace, $nscolon,$_[1],$newline); }; # assemble templates my $xml = { header => [ sprintf('%s%s<%s%scollection xmlns%s%s="%s" %s %s>%s', '', $newline, $namespace, $nscolon, $nscolon, $namespace, 'http://www.loc.gov/MARC21/slim', 'xmlns:xsi='. '"http://www.w3.org/2001/XMLSchema-instance"', 'xsi:schemaLocation="'. 'http://www.loc.gov/MARC21/slim'. ' '. 'http://www.loc.gov/standards/'. 'marcxml/schema/MARC21slim.xsd'. '"', $newline, ), &$mkend(0,'collection'), ], record => [&$mkstart(1,'record',1),&$mkend(1,'record')], leader => [&$mkstart(2,'leader',0),&$mkend(0,'leader')], controlfield => [ &$mkstart(2,'controlfield tag="%03d"'), &$mkend(0,'controlfield'), ], datafield => [ &$mkstart(2, 'datafield tag="%03d" ind1="%1s" ind2="%1s"',1), &$mkend(2,'datafield'), ], subfield => [ &$mkstart(3,'subfield code="%1s"'), &$mkend(0,'subfield'), ], }; # clear header template if no header is requested $xml->{header} = ["",""] unless ($header); return $xml; } ## core record parser and dumper class {package MARC21Record; # A class that reads, parses and # dumps (as xml) MARC 21 records. # Each step is encapsulated in its own function and # stores messages in the attributes 'warning' # and/or 'error' if such a thing occurs. # Then the caller may decide on how to proceed. # # The attribute structure of this class reads as follows: # { # "record" => , # "leader" => <24 character leader substring of the record>, # "dirstring" => , # "dataarea" => , # "directory" => [ # { # "tag" => , # "length" => , # "address" =>
, # }, ... # ], # "controlfields" => [ # { # "tag" => <3 digits number in the range 0--9 !!>, # "content" => , # }, ... # ], # "datafields" => [ # { # "tag" => <3 digits number in the range 10--999 !!>, # "ind1" => , # "ind2" => , # "subfields" => [ # { # "code" => , # "content" => , # },... # ], # },... # ], # "error" => , # "warning" => , # } # # Note on comments in sub heads: argument numbers do *not* # include $this, so "arg 1" is obtained with the *second* shift. sub new # Initialize the object, read the record from # the filehandle (arg 1) via read_record. # Return the blessed object, or undef if # EOF is hit while reading the first byte. { my $classname = shift; my $filehandle = shift; my $this = { record=>undef, leader=>undef, dirstring=>undef, dataarea=>undef, directory=>[], controlfields=>[], datafields=>[], error=>undef, warning=>undef, }; bless $this,$classname; $this->read_record($filehandle); return ($this->{record}?$this:undef); } sub set_error # Unless the condition (arg 1) is true or the # attribute is already occupied, store the # error message (arg 2) in the 'error' attribute. # Return the state of the attribute (1 or 0). { my $this = shift; my $cond = shift; my $msg = shift; $this->{error} = $msg unless ($cond or $this->{error}); return defined($this->{error}); } sub set_warning # Unless the condition (arg 1) is true or the # attribute is already occupied, store the # warning message (arg 2) in the 'warning' attribute. # Return the state of the attribute (1 or 0). { my $this = shift; my $cond = shift; my $msg = shift; $this->{warning} = $msg unless ($cond or $this->{warning}); return defined($this->{warning}); } sub get_controlfield # Find and return first content of first occurrence # of the controlfield with the given tag (arg 1), # or undef if it doesn't exsit (yet). # This is mainly used to aid in error reporting. { my $this = shift; my $tag = shift; for my $field (@{$this->{controlfields}}) { return $field->{content} if $field->{tag}==$tag; } return; } sub read_record # Read the record from the given file handle (arg 1) # and store it in the 'record' attribute. # If no bytes can be read, no error is reported, # but 'record' won't be touched. { my $this = shift; my $filehandle = shift; return if defined($this->{error}); # first try to read the size (5 bytes), store whatever comes return unless read($filehandle,my $length,5); $this->{record} = $length; # check if the size makes sense return if $this->set_error(scalar($length=~m'\d{5}'), "invalid record length"); return if $this->set_error(scalar($length>5), "record is too small"); # attempt to read the rest, check for completeness read($filehandle,my $record,$length-5); $this->{record} .= $record; $this->set_error(length($this->{record})==$length, "failed to read complete record"); return; } sub parse_leader # Fill 'leader', 'dirstring' and 'dataarea', # also perform some sanity checks on the leader. { my $this = shift; return if defined($this->{error}); # fill 'leader' attribute, then extract interesting fields $this->{leader} = substr($this->{record},0,24); my @values = ($this->{leader}=~ m'^(\d{5})....(.)(..)(\d{5})...(...)(.)$'); return if $this->set_error(scalar(@values), "malformed leader"); my %ldr; @ldr{(qw'length coding is_cnt data_addr lsi_cnt undef')} = @values; # check fields for sanity # (see # or ) $this->set_error($ldr{data_addr}<$ldr{length}, "invalid data address (beyond record length)"); $this->set_error($ldr{data_addr}>=25, "invalid data address (within leader)"); return if defined($this->{error}); $this->set_warning($ldr{coding} eq 'a', "character coding is not UTF-8"); $this->set_warning(scalar($ldr{is_cnt}=~m'[2 ][2 ]'), "indicator and/or subfield count is not '2'"); $this->set_warning(scalar($ldr{lsi_cnt}=~m'[4 ][5 ][0 ]'), "portions lengths violate specification"); $this->set_warning($ldr{undef} eq '0', "undefined field is not '0'"); $this->set_warning( substr($this->{record},$ldr{data_addr}-1,1), "field terminator (0x1e) missing at end of directory"); $this->set_warning( substr($this->{record},$ldr{length}-1) eq "\x1d", "record terminator (0x1d) not found"); # fill remaining fields $this->{dirstring} = substr($this->{record},24, $ldr{data_addr}-25); $this->{dataarea} = substr($this->{record},$ldr{data_addr}); return; } sub parse_directory # Parse the 'dirstring' attribute, fill 'directory'. { my $this = shift; return if defined($this->{error}); # check length of directory return if $this->set_error(length($this->{dirstring})%12==0, "malformed directory"); # loop over 12-byte-blocks my $count = 0; # for error reporting for my $block ($this->{dirstring}=~m'.{12}'g) { my @data = ($block=~m'(\d{3})(\d{4})(\d{5})'); return if $this->set_error(scalar(@data), "directory entry #$count malformed"); my %entry; @entry{(qw'tag length address')} = @data; push @{$this->{directory}}, \%entry; $count++; } return; } sub parse_subfields # Parse the subfields of a given datafield string (tag>9). # Return the array (reference thereof) # needed for the 'subfields' element. { # note: perlcritic dislikes undef as explicit return value, # but this suppresses an interpreter # warning in case of broken records. my $this = shift; my $str = shift; return undef if defined($this->{error}); ## no critic # each subfield starts with separator (0x1f) return undef ## no critic if $this->set_error(substr($str,0,1) eq "\x1f", "subfield separator (0x1f) mismatch/not found"); my @fields; my $count = 0; # for error reporting for my $block (split("\\\x1f",substr($str,1))) { # each subfield starts with the subfield code return undef if $this->set_error($block, ## no critic "subfield #$count: no subfield code"); push @fields, {code=>substr($block,0,1),content=>substr($block,1)}; $count++; } return \@fields; } sub parse_fields # Parse the 'dataarea' attribute, # fill 'controlfields' and 'datafields'. { my $this = shift; return if defined($this->{error}); my $count = 0; # for error reporting for my $entry (@{$this->{directory}}) { # warn if terminator character at end of block is missing my $sep = substr($this->{dataarea}, $entry->{address}+$entry->{length}-1,1); $this->set_warning($sep eq "\x1e", "field #$count (tag $entry->{tag}): ". "terminator mismatch/not found"); # also check special tag 8 (specs require 40 characters) $this->set_warning( ($entry->{tag}!=8) or ($entry->{length}==41), "field #$count (tag 008): ". "length is not 40 characters"); # control fields (tag<10) and data fields (tag>=10) # need different treatment if ($entry->{tag}<10) { my $field = { tag => $entry->{tag}, content => substr($this->{dataarea}, $entry->{address},$entry->{length}-1), }; push @{$this->{controlfields}},$field; } else { my $field = { tag => $entry->{tag}, ind1 => substr($this->{dataarea}, $entry->{address},1), ind2 => substr($this->{dataarea}, $entry->{address}+1,1), subfields => $this->parse_subfields( substr($this->{dataarea}, $entry->{address}+2,$entry->{length}-3)), }; # if an error occured, add the current data field's # number and tag to the message, then stop processing if (defined($this->{error})) { $this->{error} = "field #$count (tag $entry->{tag}): ". $this->{error}; return; } push @{$this->{datafields}},$field; } $count++; } return; } sub xml_escape # XML-escape '&', '<', '>', '"', # replace illegal characters in the given string with # $xml_badchar_substitute (see global constants). # (This is not a class method!) { my $s = shift; $s = main::decode("utf8",$s); $s =~ s/&/&/g; $s =~ s//>/g; $s =~ s/"/"/g; # " # (fix LaTeX code highlighting) # escape illegal characters as detailed here # $s =~ s/([^ \x{9} \x{a} \x{d} \x{20}-\x{d7ff} \x{e000}-\x{fffd} \x{10000}-\x{10ffff} ])/$xml_badchar_substitute/gex; $s = main::encode("utf8",$s); return $s; } sub dump_xml # Use the templates (arg 2, from make_xml_templates) to dump # the record to the given file handle (arg 1) as MARCXML. { my $this = shift; my $filehandle = shift; my $xml = shift; return if defined($this->{error}); print $filehandle $xml->{record}->[0], $xml->{leader}->[0], xml_escape($this->{leader}), $xml->{leader}->[1], ; for my $field (@{$this->{controlfields}}) { printf $filehandle $xml->{controlfield}->[0], $field->{tag}; print $filehandle xml_escape($field->{content}), $xml->{controlfield}->[1]; } for my $field (@{$this->{datafields}}) { printf $filehandle $xml->{datafield}->[0],$field->{tag}, xml_escape($field->{ind1}), xml_escape($field->{ind2}); for my $subfield (@{$field->{subfields}}) { printf $filehandle $xml->{subfield}->[0], xml_escape($subfield->{code}); print $filehandle xml_escape($subfield->{content}), $xml->{subfield}->[1]; } print $filehandle $xml->{datafield}->[1]; } print $filehandle $xml->{record}->[1]; return; } } ## command line interface sub process_command_line # Parse command line, return array with # * hash reference to xml templates # * verbosity (0,1,2) # * strictness (specs violations are errors) # * skip-file (skip broken records, write them to file) { # parse command line my %options; if (not GetOptions(\%options,( "strict", "skip=s", "header", "namespace=s", "indent", "verbose+", "version", "help+", ))) { print STDERR "Use '--help' for usage hints\n"; exit 2; } # handle secondary ("Miscellaneous") options if (defined($options{version})) { $0 =~ m'.*/([^/]+)'; print STDERR "$1 ",join(".",@version),"\n"; exit; } if (defined($options{help})) { pod2usage(-verbose=>8) if ($options{help}>1); pod2usage(-verbose=>99,-sections=>['OPTIONS']); } # get xml templates, return my $xml = make_xml_templates( @options{(qw'indent namespace header')}); return ($xml,($options{verbose} or 0), @options{(qw'strict skip')}); } sub main { my ($xml,$verbose,$strict,$skip) = process_command_line(); my $status = 0; # proposed exit status, indicates abortion my $count; # for progress/warning/error reporting my $record; # holds the current record instance my $skipfh; # file handle for skip file # if $strict is set, warnings can't be hidden $verbose = $verbose>1?$verbose:1 if $strict; # open skip file if ($skip) { # perlcritic says: close filehandle as soon as possible; # however, the filehandle is still needed later if (not open $skipfh, ">>", $skip) ## no critic { print STDERR "error: failed to open '$skip'\n"; return 4; } } # two helper functions to check record status my $record_msg = sub # Prefix the given message (arg 1) with a # short string, which identifies the current record, # then send it to stderr. { my $msg = shift; if ($record) { my $cf = $record->get_controlfield(1); $msg = sprintf("record %9s%s: %s", "#$count",$cf?" [tag001=$cf]":"",$msg); } print STDERR $msg, "\n"; return; }; my $record_report_skip = sub # Evaluate the warning/error attributes of the current record. # Print error or warning message if appropriate. # Update $status to indicate skipping or abortion. # Also indicate skipping of record with return value 1. # Errors don't cause abortion if $skip is true; # to enforce abortion (after read_record), set arg 1 to 1. # Note: Warnings are ignored (even with $strict, unless arg 2 # is 1) to defer warning messages until tag 001 got parsed. { my $abort = shift; # force abort on error, even with $skip my $warn = shift; # print warning, even if harmless # check and print warning my $warning = $record->{warning}; my $error = $record->{error}; $warn ||= defined($error); $warn &&= defined($warning) && $verbose; &$record_msg("warning: $warning") if $warn; return unless (defined($error) or ($warn and $strict)); # at this point, the record will certainly be skipped &$record_msg("error: $error") if defined($error); &$record_msg("skipped"); print $skipfh $record->{record} if ($skipfh and not $abort); $status = $abort?3:defined($skip)?1:2; return 1; }; # core loop $count = 0; print STDOUT $xml->{header}->[0]; while ($record=MARC21Record->new(*STDIN)) { next if &$record_report_skip(1,0); # parse leader $record->parse_leader(); &$record_msg(sprintf("%5d bytes, leader: '%s'", length($record->{record}),$record->{leader})) if $verbose>2; next if &$record_report_skip(0,0); # parse directory $record->parse_directory(); next if &$record_report_skip(0,0); &$record_msg(sprintf("tags: %s", join(" ",map({$_->{tag}} @{$record->{directory}})))) if $verbose>2; # parse data fields $record->parse_fields(); next if &$record_report_skip(0,1); # final log message &$record_msg(sprintf( "%5d bytes, %3d control field(s), %3d data field(s)", length($record->{record}), scalar(@{$record->{controlfields}}), scalar(@{$record->{datafields}}), )) if $verbose>1; # dump xml $record->dump_xml(*STDOUT,$xml); } continue { if ($status>1) { print STDERR "aborting conversion\n"; last; } $count++; } print STDOUT $xml->{header}->[1]; close $skipfh if $skip; return $status; } exit(main()); ################################################################ __END__ =head1 NAME marc21-marcxml - convert MARC 21 records to MARCXML =head1 SYNOPSIS marc21-marcxml [options] =head1 DESCRIPTION Read one or several concatenated MARC 21 records from standard input and write corresponding MARCXML records to standard output. =head1 OPTIONS =head2 Record Processing =over =item C<--strict> Abort conversion if a record violates MARC 21 specifications (as far as known to this program). By default, such violations trigger warnings, which aren't visible unless C<--verbose> is used. =item C<--skip> {F|''} Don't abort conversion if a record causes an error (possibly due to C<--strict>), skip the record instead. By default, conversion is aborted whenever a record cannot be processed. This option expects the path to a file as argument; skipped records will be appended to this file. To simply drop skipped records, provide the empty string as argument (possibly a pair of apostrophes, depending on your shell). Note that conversion is still aborted when the record length (first five bytes of the MARC leader) cannot be parsed. =back =head2 XML Generation =over =item C<--header> Add an XML declaration and C root element to the output. If this option is ommitted, only bare C elements will be written to standard output. This allows for easy concatenation of the outputs of several invocations, but the result likely needs to be enclosed in a C element for further processing. =item C<--namespace> {I|''} Explicitely define and use the given namespace for all XML elements (C if the empty string is provided as argument). If ommitted, a default namespace is set in the C root element. =item C<--indent> Add indentation to XML output, so as to make it more human-readable. =back =head2 Miscellaneous Options =over =item C<--verbose> Print warnings to standard error, in addition to error messages. Use twice to print a summary line for each record processed, and three times to print additional debug information about each record. =item C<--version> Print version information and exit. =item C<--help> Print usage information and exit. Use twice to obtain a short manual page. =back =head1 EXIT STATUS =over =item C<4> The file given to the C<--skip> option could not be opened for appending. =item C<3> Conversion was aborted because a MARC 21 record could not be read completely (possibly due to a broken leader). =item C<2> Conversion was aborted while a record was being processed, due to an error or due to C<--strict>. =item C<1> At least one record was skipped. =item C<0> All records have been converted successfully. =back =head1 AUTHOR Written by Wolfgang Boiger (L) as part of a librarian clerkship assignment. =head1 COPYRIGHT Copyright (C) 2015 Wolfgang Boiger, Muenchen. Licence AGPL3+: GNU Affero GPL version 3 or later L. This is free software; you can use it, redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This software is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.