// ==UserScript==
// @name Eza's Tumblr Scrape
// @namespace https://inkbunny.net/ezalias
// @description Creates a new page showing just the images from any Tumblr
// @license MIT
// @license Public domain / No rights reserved
// @include http://*?ezastumblrscrape*
// @include https://*?ezastumblrscrape*
// @include http://*/ezastumblrscrape*
// @include http://*.tumblr.com/
// @include https://*.tumblr.com/
// @include http://*.tumblr.com/page/*
// @include https://*.tumblr.com/page/*
// @include http://*.tumblr.com/tagged/*
// @include https://*.tumblr.com/tagged/*
// @include http://*.tumblr.com/archive
// @include http://*.co.vu/*
// @exclude *imageshack.us*
// @exclude *imageshack.com*
// @grant GM_registerMenuCommand
// @version 5.0
// @downloadURL none
// ==/UserScript==
// Create an imaginary page on the relevant Tumblr domain, mostly to avoid the ridiculous same-origin policy for public HTML pages. Populate page with all images from that Tumblr. Add links to this page on normal pages within the blog.
// This script also works on off-site Tumblrs, by the way - just add /archive?ezastumblrscrape?scrapewholesite after the ".com" or whatever. Sorry it's not more concise.
// Make it work, make it fast, make it pretty - in that order.
// TODO:
// I'll have to add filtering as some kind of text input... and could potentially do multi-tag filtering, if I can reliably identify posts and/or reliably match tag definitions to images and image sets.
// This is a good feature for doing /scrapewholesite to get text links and then paging through them with fancy dynamic presentation nonsense. Also: duplicate elision.
// I'd love to do some multi-scrape stuff, e.g. scraping both /tagged/homestuck and /tagged/art, but that requires some communication between divs to avoid constant repetition.
// I should start handling "after the cut" situations somehow, e.g. http://banavalope.tumblr.com/post/72117644857/roachpatrol-punispompouspornpalace-happy-new
// Just grab any link to a specific /post. Occasional duplication is fine, we don't care.
// Wait, shit. Every theme should link to every page. And my banavalope example doesn't even link to the same domain, so we couldn't get it with raw AJAX. Meh. It's just a rare problem we'll have to ignore.
// http://askleijon.tumblr.com/ezastumblrscrape is a good example - lots of posts link to outside images (mostly imgur)
// I could detect "read more" links if I can identify the text-content portion of posts. links to /post/ pages are universal theme elements, but become special when they're something the user links to intentionally.
// for example: narcisso's dream on http://cute-blue.tumblr.com/ only shows the cover because the rest is behind a break.
// post-level detection would also be great because it'd let me filter out reblogs. fuck all these people with 1000-page tumblrs, shitty animated gifs in their theme, infinite scrolling, and NO FUCKING TAGS. looking at you, http://neuroticnick.tumblr.com/post/16618331343/oh-gamzee#dnr - you prick.
// Look into Tumblr Saviour to see how they handle and filter out text posts.
// Should non-image links from images be gathered at the top of each 'page' on the image browser? E.g. http://askNSFWcobaltsnow.tumblr.com links to Derpibooru a lot. Should those be listed before the images?
// I worry it'd pick up a lot of crap, like facebook and the main page. More blacklists / whitelists. Save it for when individual posts are detected.
// ScrapeWholeSite: 10 pages at once by doing 10 separate xmlhttpwhatever objects, waiting for each to flip some bit in a 10-bool array? Clumsy parallelism. Possibly recursion, if the check for are-we-all-done-yet is in the status==4 callback.
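// A minimal sketch of the "clumsy parallelism" idea above, with a completion counter standing in for the 10-bool array.
// The helper name and its handle_page/all_done callbacks are hypothetical; nothing in this script calls this -
// the real implementation further down uses fetch() and Promise.all instead.
function fetch_pages_in_parallel( urls, handle_page, all_done ) {
	var finished = 0;
	urls.forEach( function( url ) {
		var request = new XMLHttpRequest();
		request.open( "GET", url );
		request.onreadystatechange = function() {
			if( request.readyState == 4 ) { // The are-we-all-done-yet check lives in the readyState==4 callback
				handle_page( url, request.responseText );
				finished++;
				if( finished == urls.length ) { all_done(); } // Whichever request finishes last reports completion
			}
		};
		request.send();
	} );
}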
// I should probably implement a box and button for choosing lastpage, just for noob usability's sake. Maybe it'd only appear if pages==2.
// Add a convenient interface for changing options? "Change browsing options" to unhide a div that lists every ?key=value pair, with text-entry boxes or radio buttons as appropriate, and a button that pushes a new URL into the address bar and re-hides the div. Would need to be separate from thumbnail toggle so long as anything false is suppressed in get_url or whatever.
// Dropdown menus? Thumbnails yes/no, Pages At Once 1-20. These change the options_map settings immediately, so next/prev links will use them. Link to Apply Changes uses same ?startpage as current.
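// Sketch of the dropdown idea above: a <select> whose change handler writes straight into options_map, so links
// built afterward pick up the new value. The helper name is hypothetical and nothing calls it yet.
function add_pages_at_once_dropdown( parent_element ) {
	var select = document.createElement( "select" );
	for( var n = 1; n <= 20; n++ ) {
		var option = document.createElement( "option" );
		option.value = n;
		option.textContent = n + ( n == 1 ? " page" : " pages" ) + " at once";
		select.appendChild( option );
	}
	select.value = options_map.pagesatonce; // Show the current setting
	select.addEventListener( "change", function() { options_map.pagesatonce = parseInt( select.value ); } );
	parent_element.appendChild( select );
}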
// Could I generalize that the way I've generalized Image Glutton? E.g., grab all links from a Pixiv gallery page, show all images and all manga pages.
// Possibly @include any ?scrapeeverythingdammit to grab all links and embed all pictures found on them. single-jump recursive web mirroring. (fucking same-domain policy!)
// now that I've got key-value mapping, add a link for 'view original posts only (experimental).' er, 'hide reblogs?' difficult to accurately convey.
// make it an element of the post-scraping function. then it would also work on scrape-whole-tumblr.
// better yet: call it separately, then use the post-scraping function on each post-level chunk of HTML. i.e. call scrape_without_reblogs from scrape_whole_tumblr, split off each post into strings, and call soft_scrape_page( single_post_string ) to get all the same images. (a rough sketch of that splitting is below.)
// or would it be better to get all images from any post? doing this by-post means we aren't getting theme nonsense (mostly).
// maybe just exclude images where a link to another tumblr happens before the next image... no, text posts could screw that up.
// general post detection is about recognizing patterns. can we automate it heuristically? bear in mind it'd be done at least once per scrape-page, and possibly once per tumblr-page.
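// A rough sketch of the per-post splitting described a few notes up, assuming posts can be recognized by some
// theme-dependent marker - '<article' here, purely as a placeholder. Illustration only; real themes vary, which is
// the whole problem, and nothing in this script calls this yet.
function split_into_posts( page_html, post_marker ) {
	post_marker = post_marker || "<article"; // Assumed marker; would need per-theme detection in practice
	var chunks = page_html.split( post_marker ); // First chunk is the theme's header junk, the rest start at posts
	return chunks.slice( 1 ).map( function( chunk ) { return post_marker + chunk; } );
}
// Intended use: feed each chunk to soft_scrape_page( single_post_string ), skipping chunks that look like reblogs.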
// Add picturepush.com to whitelist - or just add anything with an image file extension? Once we're filtering duplicates, Facebook buttons won't matter.
// user b84485 seems to be using the scrape-whole-site option to open image links in tabs, and so is annoyed by the 500/1280 duplicates. maybe a 'remove duplicates' button after the whole site's done?
// It's a legitimately good idea. Lord knows I prefer opening images in tabs under most circumstances.
// Basically I want a "Browse Links" page instead of just "grab everything that isn't nailed down."
// http://mekacrap.tumblr.com/post/82151443664/oh-my-looks-like-theres-some-pussy-under#dnr - lots of 'read more' stuff, for when that's implemented.
// eza's tumblr scrape: "read more" might be tumblr standard.
// e.g.
// http://c-enpai.tumblr.com/ - interesting content visible in /archive, but every page is 'themed' to be a blank front page. wtf.
// "Scrape" link should appear in /archive, for consistency. Damn thing's unclickable on some themes.
// why am I looking for specific domains to sort to the front? imgur, deviantart, etc. - just do it for any image that's not on *.tumblr.com, fool.
// chokes on multi-thousand-page tumblrs like actual-vriska, at least when listing all pages. it's just link-heavy text. maybe skip having a div for every page and just append to one div. or skip divs and append to the raw document innerHTML. it could be a memory thing, if ajax elements are never destroyed.
// multi-thousand-page tumblrs make "find image links from all pages" choke. massive memory use, massive CPU load. ridiculous. it's just text. (alright, it's links and ajax requests, but it's doggedly linear.)
// maybe skip individual divs and append the raw pile-of-links hypertext into one div. or skip divs entirely and append it straight to the document innerHTML.
// could it be a memory leak thing? are ajax elements getting properly released and destroyed when their scope ends? kind of ridiculous either way, considering we're holding just a few kilobytes of text per page.
// try re-using the same ajax object.
// Expand options_url to take an arbitrary list of key,value,key,value pairs.
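// One possible shape for that generalized options_url, taking any number of key,value arguments. This is only a
// sketch of the TODO above - the options_url the rest of the script calls still takes a single key and value.
function options_url_multi() {
	var new_options = Object.assign( {}, options_map ); // Copy current options so the live map isn't disturbed
	for( var i = 0; i + 1 < arguments.length; i += 2 ) { new_options[ arguments[i] ] = arguments[i+1]; }
	var url = window.location.protocol + "//" + window.location.hostname + "/archive?ezastumblrscrape";
	for( var key in new_options ) {
		if( new_options[ key ] === false ) { continue; } // Anything false is suppressed, per the note above
		url += "?" + key + "=" + new_options[ key ];
	}
	return url;
}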
/* Assorted notes from another text file
. eza's tumblr scrape - testing open-loop vs. closed-loop updating for large tumblrs. caffeccino has 200-ish pages. from a cached state, and with stuff downloading, getting all 221 the old way takes 8m20s and has noticeable slowdown past 40-ish. new method takes 16m and is honestly not very fast from the outset. the use of a global variable might cause ugly locking. with js, who knows.
. eza's tumblr fixiv? de-style everything by simply erasing the page's CSS?
*/

var options_map = { startpage: 1, pagesatonce: 10, find: "" }; // Global defaults; any ?key=value pair in the URL overrides these below

if( window.location.href.indexOf( "ezastumblrscrape" ) > -1 ) { // On one of our invented scrape pages, rebuild the document from scratch
document.head.innerHTML += "<style> img { width: auto; } .thumbnails img { width: 240px; } </style>"; // Auto by default, fixed-size if parent class includes 'thumbnail'
document.body.style.backgroundColor="#DDDDDD"; // Light grey BG to make image boundaries more obvious
document.body.innerHTML = "<div id='maindiv'></div> <div id='bottom_controls_div'></div>"; // Bare-bones markup; prev/next controls also get copied into the bottom div
var mydiv = document.getElementById( "maindiv" ); // I apologize for the generic name. This script used to be a lot simpler.
// Identify options in URL (in the form of ?key=value pairs)
var key_value_array = window.location.href.split( '?' ); // Knowing how to do it the hard way is less impressive than knowing how not to do it the hard way.
key_value_array.shift(); // The first element will be the site URL. Durrrr.
for( dollarsign of key_value_array ) { // forEach( key_value_array ), including clumsy homage to $_
var this_pair = dollarsign.split( '=' ); // Split key=value into [key,value] (or sometimes just [key])
if( this_pair.length < 2 ) { this_pair.push( true ); } // If there's no value for this key, make its value boolean True
if( this_pair[1] == "false " ) { this_pair[1] = false; } // If the value is the string "false" then make it False - note fun with 1-ordinal "length" and 0-ordinal array[element].
else if( !isNaN( parseInt( this_pair[1] ) ) ) { this_pair[1] = parseInt( this_pair[1] ); } // If the value string looks like a number, make it a number
options_map[ this_pair[0] ] = this_pair[1]; // options_map.key = value
}
if( options_map.find[ options_map.find.length - 1 ] == "/" ) { options_map.find = options_map.find.substring( 0, options_map.find.length - 1 ); }
// kludge - prevents example.tumblr.com//page/2 nonsense.
if( options_map.thumbnails ) { document.body.className = "thumbnails"; } // CSS approach to thumbnail sizing; className="" to toggle back to auto.
// Add tags to title, for archival and identification purposes
document.title += options_map.find.split('/').join(' '); // E.g. /tagged/example/chrono -> "tagged example chrono"
// Go to image browser or link scraper according to URL options.
mydiv.innerHTML = "Not all images are guaranteed to appear. "; // Thanks to JS's wacky accomodating nature, mydiv is global despite appearing in an if-else block.
if( options_map[ "scrapewholesite" ] ) {
scrape_whole_tumblr(); // Images from every page, presented as text links
} else {
scrape_tumblr_pages(); // Ten pages of embedded images at a time
}
} else { // If it's just a normal Tumblr page, add a link to the appropriate /ezastumblrscrape URL
// Add link(s) to the standard "+Follow / Dashboard" nonsense. Before +Follow, I think - to avoid messing with users' muscle memory.
// This is currently beyond my ability to dick with JS through a script in a plugin. Let's kludge it for immediate usability.
// kludge by Ivan - http://userscripts-mirror.org/scripts/review/65725.html
var url = window.location.protocol + "//" + window.location.hostname + "/archive?ezastumblrscrape?scrapewholesite?find=" + window.location.pathname;
// Preserve /tagged/tag/chrono, etc. Also preserve http: vs https: via "location.protocol".
if( url.indexOf( "/page/chrono" ) < 0 ) { // Basically checking for posts /tagged/page, thanks to Detective-Pony. Don't even ask.
if( url.lastIndexOf( "/page/" ) > 0 ) { url = url.substring( 0, url.lastIndexOf( "/page/" ) ); } // Don't include e.g. /page/2. We'll add that ourselves.
}
// Don't clean this up. It's not permanent.
var eLink = document.createElement("a");
eLink.setAttribute("id","edit_link");
eLink.setAttribute("style","position:absolute;top:26px;right:2px;padding:2px 0 0;width:50px;height:18px;display:block;overflow:hidden;-moz-border-radius:3px;background:#777;color:#fff;font-size:8pt;text-decoration:none;font-weight:bold;text-align:center;line-height:12pt;");
eLink.setAttribute("href", url);
eLink.appendChild(document.createTextNode("Scrape"));
var elBody = document.getElementsByTagName("body")[0];
elBody.appendChild(eLink);
// Greasemonkey now supports user commands through its add-on menu! Thus: no more manually typing /archive?ezastumblrscrape?scrapewholesite on blogs with uncooperative themes.
GM_registerMenuCommand( "Scrape whole Tumblr blog", go_to_scrapewholesite );
}
function go_to_scrapewholesite() {
var site = window.location.protocol + "//" + window.location.hostname + "/archive?ezastumblrscrape?scrapewholesite?find=" + window.location.pathname;
window.location.href = site;
}
// ------------------------------------ Whole-site scraper for use with DownThemAll ------------------------------------ //
// Monolithic scrape-whole-site function, recreating the original intent (before I added pages and made it a glorified multipage image browser)
// So for archiving, I need some kind of sister Perl script that goes 'foreach filename containing _500, if (regex _1280) exists, delete this _500 file.'
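// The same rule sketched in JS, over any list of names (URLs here, or downloaded filenames): drop each _500 entry
// whose _1280 counterpart is also present. Illustration only - nothing in this script calls it.
function remove_500_duplicates( names ) {
	var full_size = new Set( names.filter( function( name ) { return name.indexOf( "_1280" ) >= 0; } ) );
	return names.filter( function( name ) {
		return name.indexOf( "_500" ) < 0 || !full_size.has( name.replace( "_500", "_1280" ) );
	} );
}
// E.g. remove_500_duplicates( [ "a_500.jpg", "a_1280.jpg", "b_500.jpg" ] ) keeps "a_1280.jpg" and "b_500.jpg".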
function scrape_whole_tumblr() {
var highest_known_page = 0;
var site = window.location.protocol + "//" + window.location.hostname + options_map.find; // http: + // + example.tumblr.com + /tagged/sherlock
// Link to image-viewing version, preserving current tags
mydiv.innerHTML += "
";
// Browse images instead (10 pages at once) / (1 page at once) / Show text links without duplicates (WIP) ?
// Find out how many pages we need to scrape.
if( isNaN( options_map.lastpage ) ) {
// Find upper bound in a small number of fetches. Ideally we'd skip this - some themes list e.g. "Page 1 of 24." I think that requires back-end cooperation.
mydiv.innerHTML += "Finding out how many pages are in " + site.substring( site.indexOf( '/' ) + 2 ) + ":
"; // Telling users what's going on.
// Returns page number if there's no Next link, or negative page number if there is a Next link.
// Only for use on /mobile pages; relies on Tumblr's shitty standard theme
function test_next_page( body ) {
var link_index = body.indexOf( 'rel="canonical"' ); // The mobile theme's <link rel="canonical"> tag contains the current page number
var page_index = body.indexOf( '/page/', link_index );
var terminator_index = body.indexOf( '"', page_index );
var this_page = parseInt( body.substring( page_index+6, terminator_index ) );
if( body.indexOf( '>next<' ) > 0 ) { return -this_page; } else { return this_page }
}
// Generates an array of length "steps" between given boundaries - or near enough, for sanity's sake
function array_between_bounds( lower_bound, upper_bound, steps ) {
if( lower_bound > upper_bound ) { // Swap if out-of-order.
var temp = lower_bound; lower_bound = upper_bound, upper_bound = temp;
}
var bound_range = upper_bound - lower_bound;
if( steps > bound_range ) { steps = bound_range; } // Steps <= bound_range...
if( steps < 1 ) { steps = 1; } // ... but at least 1, to avoid division by zero
var pages_per_test = parseInt( bound_range / steps ); // Steps-1 here, so first element is lower_bound & last is upper_bound. Off-by-one errors, whee...
var range = Array( steps )
.fill( lower_bound )
.map( (value,index) => value += index * pages_per_test );
range.push( upper_bound );
return range;
}
// Given a (presumably sorted) list of page numbers, find the last that exists and the first that doesn't exist.
function find_reasonable_bound( test_array ) {
return Promise.all( test_array.map( pagenum => fetch( site + '/page/' + pagenum + '/mobile' ) ) )
.then( responses => Promise.all( responses.map( response => response.text() ) ) )
.then( pages => pages.map( page => test_next_page( page ) ) )
.then( numbers => {
var lower_index = -1;
numbers.forEach( (value,index) => { if( value < 0 ) { lower_index++; } } ); // Count the negative numbers (i.e., count the pages with known content)
if( lower_index < 0 ) { lower_index = 0; }
var bounds = [ Math.abs(numbers[lower_index]), numbers[lower_index+1] ]
mydiv.innerHTML += "Last page is between " + bounds[0] + " and " + bounds[1] + ". ";
return bounds;
} )
}
// Repeatedly narrow down how many pages we're talking about; find a reasonable "last" page
find_reasonable_bound( [2, 10, 100, 1000, 10000, 100000] ) // Are we talking a couple pages, or a shitload of pages?
.then( pair => find_reasonable_bound( array_between_bounds( pair[0], pair[1], 20 ) ) ) // Narrow it down. Fewer rounds of more fetches works best.
.then( pair => find_reasonable_bound( array_between_bounds( pair[0], pair[1], 20 ) ) ) // Time is round count, fetches add up, selectivity is fetches x fetches.
.then( pair => {
options_map.lastpage = pair[1];
start_scraping_button();
} );
}
else { // If we're given the highest page by the URL, just use that
start_scraping_button();
}
// Add "Scrape" button to the page. This will grab images and links from many pages and list them page-by-page.
function start_scraping_button() {
document.getElementById( 'browse' ).href += "?lastpage=" + options_map.lastpage; // Add last-page indicator to Browse Images link
if( options_map.grabrange ) { // If we're only grabbing a 1000-page block from a huge-ass tumblr:
mydiv.innerHTML += " This will grab 1000 pages starting at " + options_map.grabrange + ".
";
} else { // If we really are describing the last page:
mydiv.innerHTML += " Last page is " + options_map.lastpage + " or lower.
";
}
if( options_map.lastpage > 1500 && !options_map.grabrange ) { // If we need to link to 1000-page blocks, and aren't currently inside one:
for( var x = 1; x < options_map.lastpage; x += 1000 ) { // For every 1000 pages...
var decade_url = window.location.href + "?grabrange=" + x + "?lastpage=" + options_map.lastpage;
mydiv.innerHTML += "Pages " + x + "-" + (x+999) + " "; // ... link a range of 1000 pages.
}
}
// Add button to scrape every page, one after another.
// Buttons within GreaseMonkey are a huge pain in the ass. I stole this from stackoverflow.com/questions/6480082/ - thanks, Brock Adams.
var button = document.createElement ('div');
button.innerHTML = '<button id="myButton" type="button">Scrape whole Tumblr</button>';
button.setAttribute ( 'id', 'scrape_button' ); // I'm really not sure why this id and the above HTML id aren't the same property.
document.body.appendChild ( button ); // Add button (at the end is fine)
document.getElementById ("myButton").addEventListener ( "click", scrape_all_pages, false ); // Activate button - when clicked, it triggers scrape_all_pages()
}
}
function scrape_all_pages() { // Example code implies that this function /can/ take a parameter via the event listener, but I'm not sure how.
var button = document.getElementById( "scrape_button" ); // First, remove the button. There's no reason it should be clickable twice.
button.parentNode.removeChild( button ); // The DOM can only remove elements from a higher level. "Elements can't commit suicide, but infanticide is permitted."
if( !options_map.imagesonly ) {
options_map.showlinks = true; // For scrapewholesite, include page links by default.
}
// We need to find "site" again, because we can't pass it. Activating a Greasemonkey function from a button borders on magic. Adding parameters is outright dark sorcery.
// Use a global variable, idiot. It's fine. Just do it. It's basically constant.
var site = window.location.protocol + "//" + window.location.hostname + options_map.find; // http: + // + example.tumblr.com + /tagged/sherlock
mydiv.innerHTML += "Scraping page: "; // This makes it easier to view progress,
// Create divs for all pages' content, allowing asynchronous AJAX fetches
var x = 1;
var div_end_page = options_map.lastpage;
if( !isNaN( options_map.grabrange ) ) { // If grabbing 1000 pages from the middle of 10,000, don't create 0..10,000 divs
x = options_map.grabrange;
div_end_page = x + 1000; // Should be +999, but whatever, no harm in tiny overshoot
}
for( ; x <= div_end_page; x++ ) {
var siteurl = site + "/page/" + x;
if( options_map.usemobile ) { siteurl += "/mobile"; } // If ?usemobile is flagged, scrape the mobile version.
var new_div = document.createElement( 'div' );
new_div.id = '' + x;
document.body.appendChild( new_div );
}
// Fetch all pages with content on them
var page_counter_div = document.getElementById( 'pagecounter' ); // Probably minor, but over thousands of laggy page updates, I'll take any optimization.
pagecounter.innerHTML = "" + 1;
var url_index_array = new Array;
var current_pending_connections = 0;
var begin_page = 1;
var end_page = options_map.lastpage;
if( !isNaN( options_map.grabrange ) ) { // If a range is defined, grab only 1000 pages starting there
begin_page = options_map.grabrange;
end_page = options_map.grabrange + 999; // NOT plus 1000. Stop making that mistake. First page + 999 = 1000 total.
if( end_page > options_map.lastpage ) { end_page = options_map.lastpage; } // Kludge
document.title += " " + (parseInt( begin_page / 1000 ) + 1); // Change page title to indicate which block of pages we're saving
}
// Generate array of URL/pagenum pair-arrays
for( var x = begin_page; x <= end_page; x++ ) {
var siteurl = site + "/page/" + x;
if( options_map.usemobile ) { siteurl += "/mobile"; } // If ?usemobile is flagged, scrape the mobile version. No theme shenanigans... but also no photosets. Sigh.
url_index_array.push( [siteurl, x] );
}
// Fetch, scrape, and display all URLs. Uses promises to work in parallel and promise.all to limit speed and memory (mostly for reliability's sake).
var simultaneous_fetches = 25;
var chain = Promise.resolve(0); // Empty promise so we can use "then"
for( var x = begin_page; x <= end_page; x += simultaneous_fetches ) { // In batches of e.g. 10, add Promise.all arrays to fetch URLs
chain = chain.then( s => {
var subarray = url_index_array.splice( 0, simultaneous_fetches ); // Shift e.g. first 10 elements into separate array, for partial array.map
return Promise.all( subarray.map( page =>
Promise.all( [ fetch( page[0] ).then( s => s.text() ), page[1], page[0] ] ) // Return [ body of page, page number, page URL ]
) )
} )
.then( responses => responses.map( s => { // Scrape URLs for links and images, display on page
var pagenum = s[1];
var page_url = s[2];
var url_array = soft_scrape_page_promise( s[0] ) // Surprise, this is a promise now
.then( urls => {
// Print URLs so DownThemAll (or similar) can grab them
var bulk_string = " " + page_url + " "; // A digest, so we can update innerHTML just once per div
urls.forEach( (value,index,array) => {
if( options_map.plaintext ) {
bulk_string += value + ' ';
} else {
bulk_string += "<a href='" + value + "'>" + value + "</a><br>";
}
} )
document.getElementById( '' + pagenum ).innerHTML = bulk_string;
if( parseInt( page_counter_div.innerHTML ) < pagenum ) { page_counter_div.innerHTML = "" + pagenum; } // Increment pagecounter (where sensible)
} );
} )
)
}
chain = chain.then( s => { document.getElementById( 'afterpagecounter' ).innerHTML = "Done. Use DownThemAll (or a similar plugin) to grab all these links."; } )
}
// ------------------------------------ Multi-page scraper with embedded images ------------------------------------ //
function scrape_tumblr_pages() {
// Figure out which site we're scraping
var site = window.location.protocol + "//" + window.location.hostname + options_map.find; // http: + // + example.tumblr.com + /tagged/sherlock
if( isNaN( parseInt( options_map.startpage ) ) || options_map.startpage <= 1 ) { options_map.startpage = 1; }
var next_link = options_url( "startpage", options_map.startpage + options_map.pagesatonce );
var prev_link = options_url( "startpage", options_map.startpage - options_map.pagesatonce );
var prev_next_controls = " ";
if( options_map.startpage > 1 ) { prev_next_controls += "<<< Previous - "; }
prev_next_controls += "Next >>>
";
mydiv.innerHTML += prev_next_controls;
document.getElementById("bottom_controls_div").innerHTML += prev_next_controls;
// Link to the thumbnail page or full-size-image page as appropriate
if( options_map.thumbnails ) { mydiv.innerHTML += "Switch to full-size images"; }
else { mydiv.innerHTML += "Switch to thumbnails"; }
// Toggle thumbnails via CSS, hopefully alter options_map accordingly
mydiv.innerHTML += " - Toggle image size";
if( options_map.pagesatonce == 1 ) { mydiv.innerHTML += " - <a href='" + options_url( "pagesatonce", 10 ) + "'>Show ten pages at once</a>"; }
else { mydiv.innerHTML += " - <a href='" + options_url( "pagesatonce", 1 ) + "'>Show one page at once</a>"; }
mydiv.innerHTML += " - <a href='" + options_url( "scrapewholesite", true ) + "'>Scrape whole Tumblr</a> <br>";
// Fill an array with the page URLs to be scraped (and create per-page divs while we're at it)
var pages = new Array( parseInt( options_map.pagesatonce ) )
.fill( parseInt( options_map.startpage ) )
.map( (value,index) => value+index );
pages.forEach( pagenum => {
mydiv.innerHTML += "