BPLinkBot/Source
Dependencies
BPLinkBot requires the following Perl modules to be installed on the system. Some of these may already be present (they are core modules) but may need to be upgraded to their latest versions. The packages can be obtained from the CPAN website; an example install command is shown after the list.
- MediaWiki::API
- JSON
- Crypt::SSLeay
- Digest::MD5
- Test::Simple
- MediaWiki::Bot
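One way to pull these in (a sketch only, assuming the stock CPAN.pm client has already been configured on the machine; cpanm or OS packages would work just as well) is a short Perl script using the install() function that CPAN.pm exports:

#!/usr/bin/perl
# Sketch: installs the modules listed above via the stock CPAN client.
# Assumes CPAN.pm has been configured (run "cpan" once interactively if it has not).
use CPAN;

# install() fetches, builds, tests and installs each named module.
install($_) for qw(
    MediaWiki::API
    JSON
    Crypt::SSLeay
    Digest::MD5
    Test::Simple
    MediaWiki::Bot
);

The cpan(1) command-line client that ships with Perl offers an equivalent route, e.g. installing each module by name from the shell.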
Source Code
This is current as of Version 1.1 (Jan 8, 2011).
#!/usr/bin/perl

use MediaWiki::API;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use MediaWiki::Bot;

my $mw = MediaWiki::API->new();
$mw->{config}->{api_url} = 'http://www.beachapedia.org/api.php';

$mw->login( { lgname => 'BPLinkBot', lgpassword => 'REDACTED' } )
    || die $mw->{error}->{code} . ': ' . $mw->{error}->{details};

binmode STDOUT, ':utf8';

# Get a list of page Titles from the Wiki:
$mw->list ({ action => 'query',
             list   => 'allpages' },
           { max    => 5000,
             hook   => \&article_list });

sub article_list {
    my ($ref) = @_;
    foreach (@$ref) {
        push @titles, $_->{title};
    }
}

print "Found " . scalar(@titles) . " on the wiki!\n";

%links = ();

# Get a list of external links for each page Title and build the database:
# keys are URLs, values are pipe-delimited lists of page titles
foreach $title (@titles) {
    my $pagelinks = $mw->api({ action => 'query',
                               prop   => 'extlinks',
                               titles => $title },
                             { max    => 5000 });

    my ($id, $extlinks) = each( %{ $pagelinks->{query}->{pages} } );

    foreach ( @{ $extlinks->{extlinks} } ) {
        if (exists $links{"$_->{'*'}"}) {
            $links{"$_->{'*'}"} = $links{"$_->{'*'}"} . '|' . $title;
        } else {
            $links{"$_->{'*'}"} = $title;
        }
    }
}

print scalar(keys %links), " links found to external sites.\n\n";

# Logout using API module
$mw->logout();

# Switch to Bot module
my $bot = MediaWiki::Bot->new({
    protocol => 'http',
    host     => 'www.beachapedia.org',
    path     => '',
});

$bot->login({
    username => "BPLinkBot",
    password => 'REDACTED',
});

# Clear Badpages hash:
%badpages = ();

# Check each URL:
foreach my $url (keys %links) {
    if ($url =~ m/^mailto:/) {
        next;   # Skip email links
    }
    if ($url =~ m[^http://scc.ca.gov/]) {
        next;   # CCC links always test as bad
    }

    my @pagesusingURL = split /\|/, $links{"$url"};

    ($rawurl, $bookmark) = split /\#/, $url;   # We don't want bookmarks confusing the issue

    my $ua = LWP::UserAgent->new();
    $ua->agent("Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13");   # pretend to be a mac version of Firefox
    my $req = HTTP::Request->new(GET => $rawurl);
    $req->referer("http://www.beachapedia.org");   # Tell them where we were referred from
    my $response = $ua->request($req);

    if ($response->is_error()) {
        foreach my $page (@pagesusingURL) {
            $badpages{"$page"} = '1';
            my $talkpagetext = $bot->get_text("Talk:$page");
            $talkpagetext .= "\n\nFound Bad Link: $url\n\n--\~\~\~\~";
            $bot->edit({
                page    => "Talk:$page",
                text    => $talkpagetext,
                summary => 'Adding Bad Link Info'
            });
        }
    }
}

# Add pages with bad links to Category:HasBadLinks
if (keys %badpages) {
    foreach my $page (keys %badpages) {
        my $pagetext = $bot->get_text("$page");
        $pagetext .= "\n\[\[Category:HasBadLinks\]\]\n\n";
        $bot->edit({
            page    => $page,
            text    => $pagetext,
            summary => 'Added HasBadLinks Category'
        });
    }
}

$bot->logout();