// ==UserScript==
// @name Eza's Tumblr Scrape
// @namespace https://inkbunny.net/ezalias
// @description Creates a new page showing just the images from any Tumblr
// @license MIT
// @license Public domain / No rights reserved
// @include http://*?ezastumblrscrape*
// @include https://*?ezastumblrscrape*
// @include http://*/ezastumblrscrape*
// @include http://*.tumblr.com/
// @include https://*.tumblr.com/
// @include http://*.tumblr.com/page/*
// @include https://*.tumblr.com/page/*
// @include http://*.tumblr.com/tagged/*
// @include https://*.tumblr.com/tagged/*
// @include http://*.tumblr.com/archive
// @include http://*.co.vu/*
// @exclude *imageshack.us*
// @exclude *imageshack.com*
// @grant GM_registerMenuCommand
// @version 4.3
// @downloadURL none
// ==/UserScript==
// Create an imaginary page on the relevant Tumblr domain, mostly to avoid the ridiculous same-origin policy for public HTML pages. Populate page with all images from that Tumblr. Add links to this page on normal pages within the blog.
// This script also works on off-site Tumblrs, by the way - just add /archive?ezastumblrscrape?scrapewholesite after the ".com" or whatever. Sorry it's not more concise.
// Make it work, make it fast, make it pretty - in that order.
// TODO:
// I'll have to add filtering as some kind of text input... and could potentially do multi-tag filtering, if I can reliably identify posts and/or reliably match tag definitions to images and image sets.
// This is a good feature for doing /scrapewholesite to get text links and then paging through them with fancy dynamic presentation nonsense. Also: duplicate elision.
// I'd love to do some multi-scrape stuff, e.g. scraping both /tagged/homestuck and /tagged/art, but that requires some communication between divs to avoid constant repetition.
// I should start handling "after the cut" situations somehow, e.g. http://banavalope.tumblr.com/post/72117644857/roachpatrol-punispompouspornpalace-happy-new
// Just grab any link to a specific /post. Occasional duplication is fine, we don't care.
// Wait, shit. Every theme should link to every page. And my banavalope example doesn't even link to the same domain, so we couldn't get it with raw AJAX. Meh. It's just a rare problem we'll have to ignore.
// http://askleijon.tumblr.com/ezastumblrscrape is a good example - lots of posts link to outside images (mostly imgur)
// I could detect "read more" links if I can identify the text-content portion of posts. links to /post/ pages are universal theme elements, but become special when they're something the user links to intentionally.
// for example: narcisso's dream on http://cute-blue.tumblr.com/ only shows the cover because the rest is behind a break.
// post-level detection would also be great because it'd let me filter out reblogs. fuck all these people with 1000-page tumblrs, shitty animated gifs in their theme, infinite scrolling, and NO FUCKING TAGS. looking at you, http://neuroticnick.tumblr.com/post/16618331343/oh-gamzee#dnr - you prick.
// Look into Tumblr Saviour to see how they handle and filter out text posts.
// Should non-image links from images be gathered at the top of each 'page' on the image browser? E.g. http://askNSFWcobaltsnow.tumblr.com links to Derpibooru a lot. Should those be listed before the images?
// I worry it'd pick up a lot of crap, like facebook and the main page. More blacklists / whitelists. Save it for when individual posts are detected.
// ScrapeWholeSite: 10 pages at once by doing 10 separate xmlhttpwhatever objects, waiting for each to flip some bit in a 10-bool array? Clumsy parallelism. Possibly recursion, if the check for are-we-all-done-yet is in the status==4 callback.
// I should probably implement a box and button for choosing lastpage, just for noob usability's sake. Maybe it'd only appear if pages==2.
// Add a convenient interface for changing options? "Change browsing options" to unhide a div that lists every ?key=value pair, with text-entry boxes or radio buttons as appropriate, and a button that pushes a new URL into the address bar and re-hides the div. Would need to be separate from thumbnail toggle so long as anything false is suppressed in get_url or whatever.
// Dropdown menus? Thumbnails yes/no, Pages At Once 1-20. These change the options_map settings immediately, so next/prev links will use them. Link to Apply Changes uses same ?startpage as current.
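// Rough sketch of that dropdown idea (untested; the helper name and placement are made up): write straight into
// options_map on change, so whatever builds the next/prev links afterward picks up the new value automatically.
function add_pagesatonce_dropdown( parent_element ) {
	var dropdown = document.createElement( 'select' );
	for( var n = 1; n <= 20; n++ ) {		// Pages At Once 1-20
		var choice = document.createElement( 'option' );
		choice.value = n;
		choice.text = n + ( n == 1 ? " page" : " pages" ) + " at once";
		if( n == options_map.pagesatonce ) { choice.selected = true; }
		dropdown.appendChild( choice );
	}
	dropdown.addEventListener( 'change', function() {
		options_map.pagesatonce = parseInt( dropdown.value );	// Takes effect immediately; next/prev links built after this will use it
	}, false );
	parent_element.appendChild( dropdown );
}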
// Could I generalize that the way I've generalized Image Glutton? E.g., grab all links from a Pixiv gallery page, show all images and all manga pages.
// Possibly @include any ?scrapeeverythingdammit to grab all links and embed all pictures found on them. single-jump recursive web mirroring. (fucking same-domain policy!)
// now that I've got key-value mapping, add a link for 'view original posts only (experimental).' er, 'hide reblogs?' difficult to accurately convey.
// make it an element of the post-scraping function. then it would also work on scrape-whole-tumblr.
// better yet: call it separately, then use the post-scraping function on each post-level chunk of HTML. i.e. call scrape_without_reblogs from scrape_whole_tumblr, split off each post into strings, and call soft_scrape_page( single_post_string ) to get all the same images.
// or would it be better to get all images from any post? doing this by-post means we aren't getting theme nonsense (mostly).
// maybe just exclude images where a link to another tumblr happens before the next image... no, text posts could screw that up.
// general post detection is about recognizing patterns. can we automate it heuristically? bear in mind it'd be done at least once per scrape-page, and possibly once per tumblr-page.
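// Rough sketch of the by-post approach (untested; the split markers are guesses, since every theme marks up posts differently):
// chop the page into post-sized chunks and run the existing per-page scraper on each chunk, so reblog filtering could drop whole chunks.
function scrape_posts_individually( html_copy ) {
	var per_post_urls = [];
	var post_chunks = html_copy.split( /<article|<div[^>]+class="[^"]*post[^"]*"/ );	// Guessed post boundaries
	for( var p = 1; p < post_chunks.length; p++ ) {		// Chunk 0 is the header/theme junk before the first post
		per_post_urls.push( soft_scrape_page( post_chunks[p] ) );	// Same URL list as usual, just grouped by post
	}
	return per_post_urls;
}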
// Add picturepush.com to whitelist - or just add anything with an image file extension? Once we're filtering duplicates, Facebook buttons won't matter.
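// Sketch of the blanket test (untested; name's invented): keep anything whose filename ends in an image extension, whatever the domain.
function looks_like_image( url ) {
	return /\.(jpe?g|png|gif)(\?|#|$)/i.test( url );	// Extension at the end, or right before a querystring/fragment
}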
// user b84485 seems to be using the scrape-whole-site option to open image links in tabs, and so is annoyed by the 500/1280 duplicates. maybe a 'remove duplicates' button after the whole site's done?
// It's a legitimately good idea. Lord knows I prefer opening images in tabs under most circumstances.
// Basically I want a "Browse Links" page instead of just "grab everything that isn't nailed down."
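// Sketch of that duplicate-removal pass (untested; name's invented): after the whole site is listed, drop any _500/_400/etc.
// URL whose _1280 twin was also collected, plus exact repeats.
function remove_duplicate_urls( url_list ) {
	var seen = {};
	for( var n = 0; n < url_list.length; n++ ) { seen[ url_list[n] ] = true; }		// Index everything we grabbed
	var keepers = [];
	for( var n = 0; n < url_list.length; n++ ) {
		var full_size = url_list[n].replace( /_(540|500|400|250|100)\./, "_1280." );
		if( full_size != url_list[n] && seen[ full_size ] ) { continue; }	// A bigger copy exists - skip this one
		if( seen[ url_list[n] ] === "kept" ) { continue; }			// Exact repeat - skip
		seen[ url_list[n] ] = "kept";
		keepers.push( url_list[n] );
	}
	return keepers;
}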
// http://mekacrap.tumblr.com/post/82151443664/oh-my-looks-like-theres-some-pussy-under#dnr - lots of 'read more' stuff, for when that's implemented.
// eza's tumblr scrape: "read more" might be tumblr standard.
// e.g.
// http://c-enpai.tumblr.com/ - interesting content visible in /archive, but every page is 'themed' to be a blank front page. wtf.
// "Scrape" link should appear in /archive, for consistency. Damn thing's unclickable on some themes.
// why am I looking for specific domains to sort to the front? imgur, deviantart, etc. - just do it for any image that's not on *.tumblr.com, fool.
// chokes on multi-thousand-page tumblrs like actual-vriska, at least when listing all pages. it's just link-heavy text. maybe skip having a div for every page and just append to one div. or skip divs and append to the raw document innerHTML. it could be a memory thing, if ajax elements are never destroyed.
// multi-thousand-page tumblrs make "find image links from all pages" choke. massive memory use, massive CPU load. ridiculous. it's just text. (alright, it's links and ajax requests, but it's doggedly linear.)
// maybe skip individual divs and append the raw pile-of-links hypertext into one div. or skip divs entirely and append it straight to the document innerHTML.
// could it be a memory leak thing? are ajax elements getting properly released and destroyed when their scope ends? kind of ridiculous either way, considering we're holding just a few kilobytes of text per page.
// try re-using the same ajax object.
// Expand options_url to take an arbitrary list of key,value,key,value pairs.
// Escape function in JS is encodeURI. We need 'safe' URLs as tag IDs.
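// Sketch of the arbitrary-pairs version (untested; name's invented - the real options_url takes one key and one value):
// lay an object of overrides over a copy of options_map and rebuild the ?key=value tail, suppressing anything false.
function options_url_multi( overrides ) {
	var merged = {};
	for( var key in options_map ) { merged[ key ] = options_map[ key ]; }	// Copy rather than mutate - see the copy_map side-effect worry below
	for( var key in overrides ) { merged[ key ] = overrides[ key ]; }
	var url = window.location.href.split( '?' )[0];				// Site part only; the old ?pairs get rebuilt
	for( var key in merged ) {
		if( merged[ key ] === false || merged[ key ] === "" ) { continue; }
		url += "?" + key + ( merged[ key ] === true ? "" : "=" + encodeURIComponent( merged[ key ] ) );
	}
	return url;
}
// e.g. options_url_multi( { startpage: 11, thumbnails: true } ) - one link, several changed settings.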
/* Assorted notes from another text file
. eza's tumblr scrape - testing open-loop vs. closed-loop updating for large tumblrs. caffeccino has 200-ish pages. from a cached state, and with stuff downloading, getting all 221 the old way takes 8m20s and has noticeable slowdown past 40-ish. new method takes 16m and is honestly not very fast from the outset. the use of a global variable might cause ugly locking. with js, who knows.
. eza's tumblr fixiv? de-style everything by simply erasing the <style> contents?
*/
var options_map = new Object(); // Global map of ?key=value options from the URL
options_map[ "lastpage" ] = 0; // Default: unknown; scrape_whole_tumblr probes for it
options_map[ "startpage" ] = 1; // Default: start from the first page
options_map[ "pagesatonce" ] = 10; // Default: ten pages of embedded images at a time
options_map[ "find" ] = ""; // Default: the whole Tumblr, no /tagged/whatever
if( window.location.href.indexOf( "ezastumblrscrape" ) > -1 ) { // If this is one of our imaginary scrape pages, build it from scratch:
document.body.innerHTML = "<div id='maindiv'></div> <div id='bottom_controls_div'></div>"; // Blow away the theme; we only need somewhere to put our own content
document.head.innerHTML += "<style type='text/css'> .thumbnails img { max-width: 240px; max-height: 240px; } </style>"; // Auto by default, fixed-size if parent class includes 'thumbnail'
document.body.style.backgroundColor="#DDDDDD"; // Light grey BG to make image boundaries more obvious
var mydiv = document.getElementById( "maindiv" ); // I apologize for the generic name. This script used to be a lot simpler.
// Identify options in URL (in the form of ?key=value pairs)
var key_value_array = window.location.href.split( '?' ); // Knowing how to do it the hard way is less impressive than knowing how not to do it the hard way.
key_value_array.shift(); // The first element will be the site URL. Durrrr.
for( dollarsign of key_value_array ) { // forEach( key_value_array ), including clumsy homage to $_
var this_pair = dollarsign.split( '=' ); // Split key=value into [key,value] (or sometimes just [key])
if( this_pair.length < 2 ) { this_pair.push( true ); } // If there's no value for this key, make its value boolean True
if( this_pair[1] == "false" ) { this_pair[1] = false; } // If the value is the string "false" then make it False - note fun with 1-ordinal "length" and 0-ordinal array[element].
else if( !isNaN( parseInt( this_pair[1] ) ) ) { this_pair[1] = parseInt( this_pair[1] ); } // If the value string looks like a number, make it a number
options_map[ this_pair[0] ] = this_pair[1]; // options_map.key = value
}
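// e.g. "http://example.tumblr.com/archive?ezastumblrscrape?scrapewholesite?find=/tagged/art?pagesatonce=5"
// parses to options_map = { ezastumblrscrape:true, scrapewholesite:true, find:"/tagged/art", pagesatonce:5 }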
if( options_map.find[ options_map.find.length - 1 ] == "/" ) { options_map.find = options_map.find.substring( 0, options_map.find.length - 1 ); }
// kludge - prevents example.tumblr.com//page/2 nonsense.
if( options_map.thumbnails ) { document.body.className = "thumbnails"; } // CSS approach to thumbnail sizing; className="" to toggle back to auto.
// Add tags to title, for archival and identification purposes
document.title += options_map.find.split('/').join(' '); // E.g. /tagged/example/chrono -> "tagged example chrono"
// Go to image browser or link scraper according to URL options.
mydiv.innerHTML = "Not all images are guaranteed to appear. "; // Thanks to JS's wacky accommodating nature, mydiv is global despite appearing in an if-else block.
if( options_map[ "scrapewholesite" ] ) {
scrape_whole_tumblr(); // Images from every page, presented as text links
} else {
scrape_tumblr_pages(); // Ten pages of embedded images at a time
}
} else { // If it's just a normal Tumblr page, add a link to the appropriate /ezastumblrscrape URL
// Add link(s) to the standard "+Follow / Dashboard" nonsense. Before +Follow, I think - to avoid messing with users' muscle memory.
// This is currently beyond my ability to dick with JS through a script in a plugin. Let's kludge it for immediate usability.
// kludge by Ivan - http://userscripts-mirror.org/scripts/review/65725.html
var url = window.location.protocol + "//" + window.location.hostname + "/archive?ezastumblrscrape?scrapewholesite?find=" + window.location.pathname;
// Preserve /tagged/tag/chrono, etc. Also preserve http: vs https: via "location.protocol".
if( url.indexOf( "/page/chrono" ) < 0 ) { // Basically checking for posts /tagged/page, thanks to Detective-Pony. Don't even ask.
if( url.lastIndexOf( "/page/" ) > 0 ) { url = url.substring( 0, url.lastIndexOf( "/page/" ) ); } // Don't include e.g. /page/2. We'll add that ourselves.
}
// Don't clean this up. It's not permanent.
var eLink = document.createElement("a");
eLink.setAttribute("id","edit_link");
eLink.setAttribute("style","position:absolute;top:26px;right:2px;padding:2px 0 0;width:50px;height:18px;display:block;overflow:hidden;-moz-border-radius:3px;background:#777;color:#fff;font-size:8pt;text-decoration:none;font-weight:bold;text-align:center;line-height:12pt;");
eLink.setAttribute("href", url);
eLink.appendChild(document.createTextNode("Scrape"));
var elBody = document.getElementsByTagName("body")[0];
elBody.appendChild(eLink);
// Greasemonkey now supports user commands through its add-on menu! Thus: no more manually typing /archive?ezastumblrscrape?scrapewholesite on blogs with uncooperative themes.
GM_registerMenuCommand( "Scrape whole Tumblr blog", go_to_scrapewholesite );
}
function go_to_scrapewholesite() {
var site = window.location.protocol + "//" + window.location.hostname + "/archive?ezastumblrscrape?scrapewholesite?find=" + window.location.pathname;
window.location.href = site;
}
// ------------------------------------ Whole-site scraper for use with DownThemAll ------------------------------------ //
// Monolithic scrape-whole-site function, recreating the original intent (before I added pages and made it a glorified multipage image browser)
// So for archiving, I need some kind of sister Perl script that goes 'foreach filename containing _500, if (regex _1280) exists, delete this _500 file.'
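// Rough sketch of that cleanup as a separate Node script instead of Perl (untested - run it in the download folder, not in this userscript):
/*
	var fs = require( 'fs' );
	var files = fs.readdirSync( '.' );
	files.forEach( function( name ) {
		if( name.indexOf( '_500.' ) < 0 ) { return; }			// Only consider the small copies
		var big = name.replace( '_500.', '_1280.' );
		if( files.indexOf( big ) > -1 ) { fs.unlinkSync( name ); }	// A _1280 twin exists, so this _500 is a duplicate
	} );
*/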
function scrape_whole_tumblr() {
var highest_known_page = 0;
var site = window.location.protocol + "//" + window.location.hostname + options_map.find; // http: + // + example.tumblr.com + /tagged/sherlock
// Link to image-viewing version, preserving current tags
mydiv.innerHTML += "<br><a id='browse' href='" + options_url( "scrapewholesite", false ) + "'>Browse images instead</a> (10 pages at once) <br><br>";
// Browse images instead (10 pages at once) / (1 page at once) / Show text links without duplicates (WIP) ?
// Find out how many pages we need to scrape.
if( isNaN( options_map.lastpage ) ) { options_map.lastpage = 0; }
highest_page = options_map.lastpage; // kludge. I'm lazy.
if( highest_page == 0 ) {
// Find upper bound in a small number of fetches. Ideally we'd skip this - some themes list e.g. "Page 1 of 24." I think that requires back-end cooperation.
mydiv.innerHTML += "Finding out how many pages are in " + site.substring( site.indexOf( '/' ) + 2 ) + ": <br>"; // Telling users what's going on.
for( var n = 100; n > 0 && n < 100001; n *= 10 ) { // 100,000 is an arbitrary upper bound. It used to arbitrarily be lower, and then I found some BIG tumblrs...
var siteurl = site + "/page/" + n + "/mobile";
var xmlhttp = new XMLHttpRequest();
xmlhttp.onreadystatechange=function() {
if( xmlhttp.readyState == 4 ) {
if( xmlhttp.responseText.indexOf( "/page/" + (n+1) ) < 0 ) { // Does this page link to the next page? Pages too far will only link backwards.
mydiv.innerHTML += siteurl + " is too high. ";
highest_page = n;
n = -1; // break for(n) loop
} else {
mydiv.innerHTML += siteurl + " exists. ";
highest_known_page = n;
}
}
}
xmlhttp.open("GET", siteurl, false); // false=synchronous, for linear execution. No point checking if a page is final if we've already sent requests for the next.
xmlhttp.send();
}
// Binary-search closer to the actual last page
// 1000+ page examples: http://neuroticnick.tumblr.com/ - http://teufeldiabolos.co.vu/ - http://actual-vriska.tumblr.com/ - http://cullenfuckers.tumblr.com/ - http://soupery.tumblr.com - some with 10,000 pages or more.
while( highest_page > highest_known_page + 10 ) { // Arbitrary cutoff. We're just minimizing the range. A couple extra pages is reasonable; a hundred is excessive.
mydiv.innerHTML +="Narrowing down last page: ";
var middlepage = parseInt( (highest_page + highest_known_page) / 2 ); // integer midpoint between highest-known and too-high pages
var siteurl = site + "/page/" + middlepage + "/mobile";
var xmlhttp = new XMLHttpRequest();
xmlhttp.onreadystatechange=function() {
if( xmlhttp.readyState == 4 ) {
if( xmlhttp.responseText.indexOf( "/page/" + (middlepage+1) ) < 0 ) { // Test for the presence of a link to the next page.
mydiv.innerHTML += siteurl + " is high. ";
highest_page = middlepage;
} else {
mydiv.innerHTML += siteurl + " exists. ";
highest_known_page = middlepage;
}
}
}
xmlhttp.open("GET", siteurl, false); // false=synchronous, for linear execution. No point checking if a page is final if we've already sent requests for the next dozen.
xmlhttp.send();
}
}
options_map.lastpage = highest_page;
document.getElementById( 'browse' ).href += "?lastpage=" + highest_page; // Add last-page indicator to Browse Images link
if( options_map.grabrange ) { // If we're only grabbing a 1000-page block from a huge-ass tumblr:
mydiv.innerHTML += " This will grab 1000 pages starting at " + options_map.grabrange + ". <br>";
} else { // If we really are describing the last page:
mydiv.innerHTML += " Last page is " + options_map.lastpage + " or lower. <br>";
}
if( options_map.lastpage > 1500 && !options_map.grabrange ) { // If we need to link to 1000-page blocks, and aren't currently inside one:
for( var x = 1; x < options_map.lastpage; x += 1000 ) { // For every 1000 pages...
// var decade_url = window.location.href + "?startpage=" + x + "?lastpage=" + (x+999);
var decade_url = window.location.href + "?grabrange=" + x + "?lastpage=" + options_map.lastpage;
mydiv.innerHTML += "<a href='" + decade_url + "'>Pages " + x + "-" + (x+999) + "</a> <br>"; // ... link a range of 1000 pages.
}
}
// Add button to scrape every page, one after another.
// Buttons within GreaseMonkey are a huge pain in the ass. I stole this from stackoverflow.com/questions/6480082/ - thanks, Brock Adams.
var button = document.createElement ('div');
button.innerHTML = '<button id="myButton" type="button">Scrape every page</button>';
button.setAttribute ( 'id', 'scrape_button' ); // I'm really not sure why this id and the above HTML id aren't the same property.
document.body.appendChild ( button ); // Add button (at the end is fine)
document.getElementById ("myButton").addEventListener ( "click", scrape_all_pages, false ); // Activate button - when clicked, it triggers scrape_all_pages()
}
function scrape_all_pages() { // Example code implies that this function /can/ take a parameter via the event listener, but I'm not sure how.
var button = document.getElementById( "scrape_button" ); // First, remove the button. There's no reason it should be clickable twice.
button.parentNode.removeChild( button ); // The DOM can only remove elements from a higher level. "Elements can't commit suicide, but infanticide is permitted."
if( !options_map.imagesonly ) {
options_map.showlinks = true; // For scrapewholesite, include page links by default.
}
// We need to find "site" again, because we can't pass it. Activating a Greasemonkey function from a button borders on magic. Adding parameters is outright dark sorcery.
// Use a global variable, idiot. It's fine. Just do it. It's basically constant.
var site = window.location.protocol + "//" + window.location.hostname + options_map.find; // http: + // + example.tumblr.com + /tagged/sherlock
mydiv.innerHTML += "<br>Scraping page: <span id='pagecounter'></span><br>"; // This makes it easier to view progress, especially with AJAX preventing scrolling.
// Create divs for all pages' content, allowing asynchronous AJAX fetches
for( var x = 1; x <= highest_page; x++ ) {
var siteurl = site + "/page/" + x;
if( options_map.usemobile ) { siteurl += "/mobile"; } // If ?usemobile is flagged, scrape the mobile version.
var page_tag = siteurl.substring( siteurl.indexOf( '/page/' ) ); // Should be e.g. '/page/2' or '/page/2/mobile'
var new_div = document.createElement( 'div' );
new_div.id = '' + x;
document.body.appendChild( new_div );
}
// Fetch all pages with content on them
var page_counter_div = document.getElementById( 'pagecounter' ); // Probably minor, but over thousands of laggy page updates, I'll take any optimization.
url_index_array = new Array;
current_pending_connections = 0;
var begin_page = 1;
var end_page = highest_page;
if( !isNaN( options_map.grabrange ) ) { // If both ?startpage and ?lastpage are defined, grab only that range
begin_page = options_map.grabrange;
end_page = options_map.grabrange + 999; // NOT plus 1000. Stop making that mistake. First page + 999 = 1000 total.
if( end_page > options_map.lastpage ) { end_page = options_map.lastpage; } // Kludge
document.title += " " + (parseInt( begin_page / 1000 ) + 1); // Change page title to indicate which block of pages we're saving
}
for( var x = begin_page; x <= end_page; x++ ) {
var siteurl = site + "/page/" + x;
if( options_map.usemobile ) { siteurl += "/mobile"; } // If ?usemobile is flagged, scrape the mobile version. No theme shenanigans... but also no photosets. Sigh.
//page_counter_div.innerHTML = " " + x;
//asynchronous_fetch( siteurl, x ); // Sorry for the function spaghetti. Scrape_all_pages exists so a thousand pages aren't loaded in the background, and asynchronous_fetch prevents race conditions.
url_index_array.push( [siteurl, x] );
}
interval_object = window.setInterval( fetch_when_ready, 100 );
//document.getElementById( 'pagecounter' ).innerHTML += " Done. Use DownThemAll (or a similar plugin) to grab all these links.";
}
function fetch_when_ready() {
//console.log( 'glub' );
/*
while( current_pending_connections < 3 && url_index_array.length > 0 ) {
var sprog = url_index_array.shift();
asynchronous_fetch( sprog[0], sprog[1] );
current_pending_connections++;
document.getElementById( 'pagecounter' ).innerHTML = " " + sprog[1];
}
*/
if( current_pending_connections < 10 && url_index_array.length > 0 ) {
var sprog = url_index_array.shift();
asynchronous_fetch( sprog[0], sprog[1] );
current_pending_connections++;
document.getElementById( 'pagecounter' ).innerHTML = " " + sprog[1];
}
if( url_index_array.length == 0 ) {
window.clearInterval( interval_object );
//console.log( 'ribbit' );
document.getElementById( 'pagecounter' ).innerHTML += " Done. Use DownThemAll (or a similar plugin) to grab all these links.";
}
}
function asynchronous_fetch( siteurl, page ) { // separated into another function to prevent race condition (i.e. variables changing while asynchronous request is happening)
var xmlhttp = new XMLHttpRequest(); // AJAX object
xmlhttp.onreadystatechange = function() { // When the request returns, this anonymous function will trigger (repeatedly, for various stages of the reply)
if( xmlhttp.readyState == 4 ) { // Don't do anything until we're done downloading the page.
var url_array = soft_scrape_page( xmlhttp.responseText ); // turn HTML dump into list of URLs
// Print URLs so DownThemAll (or similar) can grab them
var bulk_string = "<br><a href='" + siteurl + "'>" + siteurl + "</a><br>"; // Repeatedly adding to innerHTML kills performance, so fill this "digest" and add it all.
for( var n = 0; n < url_array.length; n++ ) {
var image_url = url_array[n][1]; // url_array is an array of 2-element arrays. Each inner array goes [sort_order, image_url].
// Animated GIFs don't get resized, but still images do, so let's include the original size before altering image_url.
if( image_url.indexOf( '.gif' ) > -1 ) {
bulk_string += "<a href='" + image_url + "'>" + image_url + "</a><br>";
}
// Some lower-size images are just automatically resized. We'll change the URL to the maximum size just in case, and Tumblr will provide the highest resolution.
image_url = image_url.replace( "_540.", "_1280." );
image_url = image_url.replace( "_500.", "_1280." );
image_url = image_url.replace( "_400.", "_1280." );
image_url = image_url.replace( "_250.", "_1280." );
image_url = image_url.replace( "_100.", "_1280." );
if( options_map.plaintext ) {
bulk_string += image_url + ' '; // Hopefully this reduces strain on Firefox. It leaks and gets weird past about 10,000 pages with links enabled.
} else {
bulk_string += "<a href='" + image_url + "'>" + image_url + "</a><br>"; // "These URLs don't need to be links, but why not?" 13K-page Tumblrs is why not.
}
}
var page_div = document.getElementById( '' + page );
page_div.innerHTML = bulk_string;
current_pending_connections--;
// bulk_string = ""; // Debug-ish - garbage collection doesn't seem reliable, RAM use is bloated for 10K+ pages. (No difference.)
// url_array = "";
}
}
// Every tenth page, synchronous? Wonky timing trick.
if( page % 10 == 0 ) {
xmlhttp.open("GET", siteurl, false); // This should be "true" for asynchronous at some point, but naively, it spams hundreds of GETs per second.
} else {
xmlhttp.open("GET", siteurl, true);
}
xmlhttp.send();
}
// Fetch function to replace all fetch functions: take string, return list
// If indexof('photoset') then treat as a photoset and return image urls. If indexof('video') then treat as a video and return video url. Etc.
function universal_fetch( siteurl, synchronous ) {
var xmlhttp = new XMLHttpRequest(); // AJAX object
xmlhttp.onreadystatechange = function() { // When the request returns, this anonymous function will trigger (repeatedly, for various stages of the reply)
if( xmlhttp.readyState == 4 ) { // Don't do anything until we're done downloading the page.
var urls = new Array;
if( siteurl.indexOf( '/photoset' ) > 0 ) {
/*
// Fetch photoset iframes and put their constituent images in url_array
if( image_url.indexOf( '/photoset_iframe/' ) > -1 ) {
var photoset_xml = new XMLHttpRequest();
photoset_xml.onreadystatechange = function() {
if( photoset_xml.readyState == 4 ) { // When loaded
var photo_link_array = photoset_xml.responseText.split( 'href="' ); // Doublequotes are sitewide-standard for photosets
for( var n = 1; n < photo_link_array.length; n++ ) {
var photo_link = photo_link_array[n].substring( 0, photo_link_array[n].indexOf( '"' ) ) + "#photoset"; // Isolate link with doublequote terminator, tag as a photoset
if( n == 1 ) { photo_link += "#" + image_url; } // Tag first image in set with photoset URL so browse mode can link to it
var sort_order = parseFloat( (0.01 * n) + x );
url_array.push( [ sort_order, photo_link ] ); // "x + 1 - 1/n" for order on page. E.g. 8.5, 8.333, 8.25, shit they'll sort backwards goddammit.
}
}
}
photoset_xml.open("GET", image_url, false);
photoset_xml.send();
image_url = ""; // Prevent any further action using this URL
}
*/
} /// else if
}
}
xmlhttp.open("GET", siteurl, synchronous); // This should probably be "true" for asynchronous at some point, but naively, it spams hundreds of GETs per second. This spider script shouldn't act like a DDOS.
xmlhttp.send();
}
// ------------------------------------ Multi-page scraper with embedded images ------------------------------------ //
function scrape_tumblr_pages() { // Create a page where many images are displayed as densely as seems sensible
// Figure out which site we're scraping
var site = window.location.protocol + "//" + window.location.hostname + options_map.find; // http: + // + example.tumblr.com + /tagged/sherlock
var next_link = options_url( "startpage", options_map.startpage + options_map.pagesatonce );
var prev_link = options_url( "startpage", options_map.startpage - options_map.pagesatonce );
options_url( "startpage", 1000 ); // debug - I think I'm getting side-effects from copy_map
if( !isNaN( parseInt( options_map.startpage ) ) && options_map.startpage <= 1 ) {
options_map.startpage = 1; // Reset in case it's screwy. Negative numbers work, but all return page 1 anyway.
var prev_next_controls = " <a href='" + next_link + "'>Next >>></a> <br>";
} else {
var prev_next_controls = " <a href='" + prev_link + "'>&lt;&lt;&lt; Previous</a> - <a href='" + next_link + "'>Next >>></a> <br>";
}
mydiv.innerHTML += prev_next_controls;
document.getElementById("bottom_controls_div").innerHTML += prev_next_controls;
// Link to the thumbnail page or full-size-image page as appropriate
if( options_map.thumbnails ) { mydiv.innerHTML += "<a href='" + options_url( "thumbnails", false ) + "'>Switch to full-size images</a>"; }
else { mydiv.innerHTML += "<a href='" + options_url( "thumbnails", true ) + "'>Switch to thumbnails</a>"; }
// Toggle thumbnails via CSS, hopefully alter options_map accordingly
mydiv.innerHTML += " - <a href=\"javascript:document.body.className = document.body.className ? '' : 'thumbnails'; void(0);\">Toggle image size</a>";
if( options_map.pagesatonce == 1 ) { mydiv.innerHTML += " - <a href='" + options_url( "pagesatonce", 10 ) + "'>Show ten pages at once</a>"; }
else { mydiv.innerHTML += " - <a href='" + options_url( "pagesatonce", 1 ) + "'>Show one page at once</a>"; }
mydiv.innerHTML += " - <a href='" + options_url( "scrapewholesite", true ) + "'>Scrape whole Tumblr</a> <br>";
// Grab several pages and extract/embed images.
start_page = parseInt( options_map.startpage ); // debug-ish. I'll use these more directly soon enough.
number_of_pages_at_once = parseInt( options_map.pagesatonce );
for( x = start_page; x < start_page + number_of_pages_at_once; x++ ) {
var siteurl = site + "/page/" + x;
if( options_map.usemobile ) { siteurl += "/mobile"; } // If ?usemobile is flagged, scrape mobile version. No theme shenanigans... but also no photosets. Sigh.
mydiv.innerHTML += "<div id='" + siteurl + "'>Page " + x + " fetched</div>"; // TODO: Sanitize the URL here and in fetch_page. It's just a unique ID.
fetch_page( siteurl, mydiv ); // I'd rather do this right here, but unless the AJAX mess is inside its own function, matching a responseText to its siteurl is intractable.
}
}
function fetch_page( siteurl, mydiv ) { // Grab a page, scrape its image URLs, and embed them for easy browsing
var xmlhttp = new XMLHttpRequest(); // AJAX object
xmlhttp.onreadystatechange = function() { // When the request returns, this anonymous function will trigger (repeatedly, for various stages of the reply)
if( xmlhttp.readyState == 4 ) { // Don't do anything until we're done downloading the page.
var thisdiv = document.getElementById( siteurl ); // identify the div we printed for this page // TODO: Sanitize, as above. Code execution through this niche script is unlikely, but why keep it possible?
thisdiv.innerHTML += "<br><a href='" + siteurl + "'>" + siteurl + "</a><br>"; // link to page, in case you want to see something in-situ (e.g. for proper sourcing)
var div_digest = ""; // Instead of updating each div's HTML for every image, we'll lump it into one string and update the page once per div. (Twice, counting the page link immediately above this.)
var video_array = new Array;
var outlink_array = new Array;
var inlink_array = new Array;
var url_array = soft_scrape_page( xmlhttp.responseText ); // turn HTML dump into list of URLs
url_array.push( [0, 'this is a kludge'] ); // Insert fake final item so url_array[n] doesn't shit itself when the last item is a video/offsite/local link
// Separate links
for( var n = url_array.length-1; n >=0; n-- ) {
if( url_array[n][1].indexOf( '#video' ) > -1 ) { video_array.push( url_array[n][1] ); url_array.splice( n, 1 ); }
if( url_array[n][1].indexOf( '#offsite' ) > -1 ) { outlink_array.push( url_array[n][1] ); url_array.splice( n, 1 ); }
if( url_array[n][1].indexOf( '#local' ) > -1 ) { inlink_array.push( url_array[n][1] ); url_array.splice( n, 1 ); }
}
url_array.pop(); // Get rid of fake final item
// Display video links, if there are any
for( var n = 0; n < video_array.length; n++ ) {
div_digest += "Video: <a href='" + video_array[n] + "'>" + video_array[n] + "</a><br> "; // Link the video.
}
// Display page links, if the ?showlinks flag is enabled
outlink_array.sort( function(a,b) { return a[0] - b[0]; } ); // sort array of [counter, url] sub-arrays by counter value
inlink_array.sort( function(a,b) { return a[0] - b[0]; } );
if( options_map.showlinks ) {
div_digest += "Outgoing links: ";
for( var n = 0; n < outlink_array.length; n++ ) { div_digest += "<a href='" + outlink_array[n] + "'>O" + (n+1) + "</a> "; }
div_digest += "<br>" + "Same-Tumblr links: ";
for( var n = 0; n < inlink_array.length; n++ ) { div_digest += "<a href='" + inlink_array[n] + "'>T" + (n+1) + "</a> "; }
div_digest += "<br>";
}
// Embed high-res images to be seen, clicked, and saved
for( var n = 0; n < url_array.length; n++ ) {
var image_url = url_array[n][1]; // Ease-of-coding hack.
// For images which might have been automatically resized, assume the highest resolution exists, and change the URL accordingly.
image_url = image_url.replace( "_540.", "_1280." ); // No need to check for indexOf _540, because replace fails politely.
image_url = image_url.replace( "_500.", "_1280." );
image_url = image_url.replace( "_400.", "_1280." );
image_url = image_url.replace( "_250.", "_1280." );
image_url = image_url.replace( "_100.", "_1280." );
// This clunky function looks for a lower-res image if the high-res version doesn't exist.
var on_error = 'if(this.src.indexOf("_1280")>0){this.src=this.src.replace("_1280","_500");}'; // Swap 1280 for 500
on_error += 'else if(this.src.indexOf("_500")>0){this.src=this.src.replace("_500","_400");}'; // Or swap 500 for 400
on_error += 'else if(this.src.indexOf("_400")>0){this.src=this.src.replace("_400","_250");}'; // Or swap 400 for 250
on_error += 'else{this.src=this.src.replace("_250","_100");this.onerror=null;}'; // Or swap 250 for 100, then give up
on_error += 'document.getElementById("' + image_url + '").href=this.src;'; // Link the image to itself, regardless of size
// Embed images (linked to themselves) and link to photosets
if( image_url.indexOf( "#" ) > 0 ) { // for photosets, print the photoset link.
var photoset_url = image_url.substring( image_url.lastIndexOf( "#" ) + 1 );
// separate everything past the last hash - it's like http://tumblr.com/image#photoset#http://tumblr.com/photoset_iframe
if( photoset_url.substring(0, 4) == "http" ) { div_digest += " <a href='" + photoset_url + "'>Set:</a>"; }
// if the #photoset tag is followed by an #http URL, link the URL
}
div_digest += "<a id='" + image_url + "' target='_blank' href='" + image_url + "'><img alt='(Image)' onerror='" + on_error + "' src='" + image_url + "' /></a> ";
}
div_digest += "<br><a href='" + siteurl + "'>(End of " + siteurl + ")</a>"; // Another link to the page, because I'm tired of scrolling back up.
thisdiv.innerHTML += div_digest;
}
}
xmlhttp.open("GET", siteurl, true); // True = asynchronous. Finally got the damn thing to work! It's a right bitch to do in an inline function. JS scopes are screwy as hell.
xmlhttp.send();
}
// ------------------------------------ Universal page-scraping function (and other helper functions) ------------------------------------ //
function soft_scrape_page_redux( html_copy ) {
var url_array = new Array();
// Aha: there IS multi-splitting, using regexes as the delimiter. E.g. "Hello, awesome world!".split(/[\s,]+/); for splitting on spaces and commas.
//var http_array = html_copy.split( 'http' );
var http_array = html_copy.split( /['"]http/ );
for( var x in http_array ) {
//console.log( http_array[n].substring( 0, http_array[n].indexOf( /['"]/ ) ) );
//var url = http_array[n].substring( 0, http_array[n].indexOf( /['"]/ ) ); // Regexes don't work in indexOf because fuck you.
var delimiter = http_array[x].indexOf( '"' );
var delimiter2 = http_array[x].indexOf( "'" );
if( delimiter2 != -1 && delimiter2 < delimiter ) { delimiter = delimiter2; }
var url = "http" + http_array[x].substring( 0, delimiter );
// console.log( url );
// http_array[x] = url;
// Fetch photoset iframes and put their constituent images in url_array
// Error console keeps nagging me about synchronicity. Can I fob this off to a function (fetch_photoset(url)) and update the page later? Maybe just store each url_array in some self-updating fashion?
if( url.indexOf( '/photoset_iframe/' ) > -1 ) {
var photoset_xml = new XMLHttpRequest();
photoset_xml.onreadystatechange = function() {
if( photoset_xml.readyState == 4 ) { // When loaded
var photo_link_array = photoset_xml.responseText.split( 'href="' ); // Doublequotes are sitewide-standard for photosets
for( var n = 1; n < photo_link_array.length; n++ ) {
var photo_link = photo_link_array[n].substring( 0, photo_link_array[n].indexOf( '"' ) ) + "#photoset"; // Isolate link with doublequote terminator, tag photoset
if( n == 1 ) { photo_link += "#" + url; } // Tag first image in set with photoset URL so browse mode can link to it
var sort_order = parseFloat( (0.01 * n) + x ); // This is completely fucked.
sort_order = x;
url_array.push( [ sort_order, photo_link ] ); // "x + 1 - 1/n" for order on page. E.g. 8.5, 8.333, 8.25, shit they'll sort backwards goddammit.
}
}
}
photoset_xml.open("GET", url, false);
photoset_xml.send();
//console.log( url );
url = ""; // Prevent any further action using this URL
}
// Fetch video iframes and put their (modified) video file addresses in url_array
if( url.indexOf( ".tumblr.com/video/" ) > -1 ) {
var subdomain = url.split( '/' ); // E.g. https://www.tumblr.com/video/examplename/123456/500/ -> https,,www.tumblr.com,video,examplename,123456,500
var video_iframe = window.location.protocol + "//" + subdomain[4] + ".tumblr.com/video/" + subdomain[4] + "/" + subdomain[5] + "/" + subdomain[6];
// e.g. http://examplename.tumblr.com/video/examplename/123456/500/
// Offsite tumblrs probably fail at this. I need to figure out crossorigin="anonymous" or whatever. CORS is a pain in my ass.
var video_xml = new XMLHttpRequest(); // Fetch video iframe, get actual video-file address
video_xml.onreadystatechange = function() {
if( video_xml.readyState == 4 ) { // When loaded
var video_pointer = video_xml.responseText.indexOf( '