#!/usr/bin/perl
# $AFresh1: errata_scraper.pl,v 1.1 2011/03/21 16:26:58 andrew Exp $
########################################################################
# Copyright (c) 2011 Andrew Fresh
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
########################################################################
use strict;
use warnings;

use Mojo::Client;

# Fetches the OpenBSD 4.8 errata page and, for every <li> that carries a
# link to a ".patch" file, prints a Title / Arch / Patch / Descr record
# followed by a blank line on STDOUT.

# Collapse every run of whitespace in $text to a single space and return
# the result.  Leading/trailing whitespace is reduced, not removed.
sub _collapse_ws {
    my ($text) = @_;
    $text =~ s/\s+/ /gxms;
    return $text;
}

# Print one erratum record for the <li> element $e.
# An <li> without a ".patch" link is skipped silently; the other child
# elements are treated as optional so that one unexpected list item does
# not abort the whole run with a method call on undef.
sub _print_erratum {
    my ($e) = @_;

    my $link = $e->at('a[href$=".patch"]')
        or return;
    my $patch = $link->attrs->{href};

    # <strong> holds the title.  It is cut out of the tree (replace(''))
    # so that its text does not end up in the description below; the text
    # is read from the value replace() hands back.
    my $strong = $e->at('strong');
    my $title
        = $strong ? _collapse_ws( $strong->replace('')->all_text ) : q{};

    # <i> holds the architecture; removed from the tree for the same reason.
    my $italic = $e->at('i');
    my $arch
        = $italic ? _collapse_ws( $italic->replace('')->all_text ) : q{};

    # the li ends at p, but the parser expects a /li
    if ( my $para = $e->at('p') ) {
        $para->replace('');
    }

    # Whatever text is left in the <li> is the description.
    my $descr = _collapse_ws( $e->all_text );
    $descr =~ s/\A\s+|\s+\z//gxms;

    print 'Title: ', $title, "\n";
    print 'Arch: ',  $arch,  "\n";
    print 'Patch: ', $patch, "\n";
    print 'Descr: ', $descr, "\n";
    print "\n";

    return;
}

my $client = Mojo::Client->new;
$client->get(
    'http://www.openbsd.org/errata48.html' => sub {
        my $self = shift;
        $self->res->dom('li')->each( \&_print_erratum );
    }
);
$client->start;