' );
let story = s[0].substring( story_start, story_end );
// Problem: images are still showing up. We sort of don't want that. Clunk.
story = story.replace( /<img[^>]*>/gi, '' ); // Strip <img> tags out of the story text
let bulk_string = '<a href="' + post_url + '">' + post_url + '</a><br>'; // A digest, so we can update innerHTML just once per div
sublinks.forEach( (link) => {
let contents = link;
if( options_map.thumbnails == 'xml' && link.indexOf( '_1280' ) > -1 ) { // If we're showing thumbnails and this image can be resized, do, then show it
let img = link.replace( '_1280', '_100' );
contents = '<img src="' + img + '"><br>' + link; // <img> deserves a class, for consistent scale.
}
let this_link = '<a href="' + link + '">' + contents + '</a>';
// How is this not what's causing the CSS bleed?
// if( link.substring(0) == '\n' ) { // If this is text
if( link.indexOf( '\n' ) > -1 ) { // If this is text
this_link = link;
}
bulk_string += this_link;
} )
var tag_string = "";
tag_links.forEach( (link) => {
// let tag = link.split( '/tagged/' )[1]
tag_string += '#' + link.split( '/tagged/' )[1] + ' ';
} )
bulk_string += tag_string + "
";
// Tags should be added here. If we have them.
// console.log( bulk_string );
document.getElementById( '' + post_url ).innerHTML = bulk_string; // Yeeeah, I should probably create these div IDs before this happens.
// And here's where I'd increase the page counter, if we had one.
// Let's use order_array to judge how done we are - and make it a percent, not a page count. Use order_array.length and pretend the last element's the same size.
// document.getElementById( 'pagecounter' ).innerHTML = '%';
// No wait, we can't do it here. This is per-post, not per-page. Or... we could do it real half-assed.
} )
)
.then( s => { // I don't think we take any actual data here. This just fires once per 'responses' group, so we can indicate page count etc.
let completion = Math.ceil( 100 * (which_entry+1) / order_array.length ); // Zero-ordinal index to percentage. Blugh.
document.getElementById( 'pagecounter' ).innerHTML = '' + completion + '%';
} )
} )
// "Promises allow a flat execution pattern!" Fuck you, you liars. Look at that rat's nest of alternating braces.
// If you've done all the work in spaghetti functions somewhere else, maybe it's fine, but if you want code to happen where it fucking starts, anonymous functions SUCK.
}
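// For what it's worth, the "flat" pattern only materializes with named handler functions instead of inline lambdas.
// A minimal sketch of that style (parse_page and render_posts are hypothetical stand-ins, not functions in this script):
// fetch( some_url, { credentials: 'include' } )
//     .then( response => response.text() )
//     .then( parse_page )      // (text) => data
//     .then( render_posts )    // (data) => undefined
//     .catch( console.error );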
// ------------------------------------ Post-by-post scraper with embedded images ------------------------------------ //
// Scrape each page for /post/ links, scrape each /post/ for content, display in-order with less callback hell
// New layout & new scrape method - not required to be compatible with previous functions
function new_embedded_display() {
if( isNaN( parseInt( options_map.startpage ) ) || options_map.startpage <= 1 ) { options_map.startpage = 1; }
mydiv.innerHTML += "
" + html_previous_next_navigation() + "
" + html_page_count_navigation() + "";
document.getElementById("bottom_controls_div").innerHTML += "
" + html_page_count_navigation() + "
" + html_previous_next_navigation() + "";
// Links out from this mode - scrapewholesite, original mode, maybe other crap
//mydiv.innerHTML += "This mode is under development and subject to change."; // No longer true. It's basically feature-complete.
// mydiv.innerHTML += " -
Return to original image browser" + "
" + "
";
mydiv.innerHTML += "
" + html_ezastumblrscrape_options() + "
";
// "Pages 1 to 10 (of 100) from http://example.tumblr.com"
mydiv.innerHTML += "Pages " + options_map.startpage + " to " + (options_map.startpage + options_map.pagesatonce - 1);
if( !isNaN(options_map.lastpage) ) { mydiv.innerHTML += " (of " + options_map.lastpage + ")"; }
mydiv.innerHTML += " from " + site_and_tags + "
";
mydiv.innerHTML += image_size_options() + "<br>";
mydiv.innerHTML += image_resolution_options() + "<br>";
// Messy inline function for toggling page breaks - they're optional because we have post permalinks now
mydiv.innerHTML += "
Toggle page breaks";
mydiv.innerHTML += "
"; // Empty span for things to be placed after.
posts_placed.push( 0 ); // Because fuck special cases.
// Scrape some pages
for( let x = options_map.startpage; x < options_map.startpage + options_map.pagesatonce; x++ ) {
fetch( site_and_tags + "/page/" + x, { credentials: 'include' } ).then( r => r.text() ).then( text => {
scrape_by_posts( text, x );
} )
}
}
// Take the HTML from a /page, fetch the /post links, display images
// Probably ought to be despaghettified and combined with the above function, but I was fighting callback hell -hard- after the last major version
// Alternately, split it even further and do some .then( do_this ).then( do_that ) kinda stuff above.
function scrape_by_posts( html_copy, page_number ) {
// console.log( page_dupe_hash ); // DEBUG
let posts = links_from_page( html_copy ); // Get links on page
posts = posts.filter( link => { return link.indexOf( '/post/' ) > 0 && link.indexOf( '/photoset' ) < 0; } ); // Keep /post links but not photoset iframes
posts = posts.map( link => { return link.replace( '#notes', '' ); } ); // post/1234 is the same as /post/1234#notes
posts = posts.filter( link => link.indexOf( window.location.host ) > 0 ); // Same-origin filter. Not necessary, but it unclutters the console. Fuckin' CORS.
if( page_number != 1 ) { posts = posts.filter( novelty_filter ); } // Attempt to remove posts linked on every page, e.g. commission info. Suffers a race condition.
posts = remove_duplicates( posts ); // De-dupe
// 'posts' now contains an array of /post URLs
// Display link and linebreak before first post on this page
let first_id = posts.map( u => parseInt( u.split( '/' )[4] ) ).sort( (a,b) => a - b ).pop(); // Grab ID from its place in each URL, sort numerically (default .sort() compares strings), take the top one
let page_link = "
Page " + page_number + "";
if( posts.length == 0 ) { first_id = 1; page_link += " - No images found."; } // Handle empty pages with dummy content. Out of order, but whatever.
page_link += "
";
display_post( page_link, first_id + 0.5 ); // +/- on the ID will change with /chrono, once that matters
posts.map( link => {
fetch( link, { credentials: 'include' } ).then( r => r.text() ).then( text => {
let sublinks = links_from_page( text );
sublinks = sublinks.filter( s => { return s.indexOf( '.jpg' ) > 0 || s.indexOf( '.jpeg' ) > 0 || s.indexOf( '.png' ) > 0 || s.indexOf( '.gif' ) > 0; } );
sublinks = sublinks.filter( tumblr_blacklist_filter ); // Remove avatars and crap
sublinks = sublinks.map( image_standardizer ); // Clean up semi-dupes (e.g. same image in different sizes -> same URL)
sublinks = sublinks.filter( novelty_filter ); // Global duplicate remover
// Oh. Photosets sort of just... work? That might not be reliable; DownThemAll acts like it can't see the iframes on some themes.
// Yep, they're there. Gonna be hard to notice if/when they fail. Oh well, "not all images are guaranteed to appear."
// Videos will still be weird. (But it does grab their preview thumbnails.)
// Wait, can I filter reblogs here? E.g. with a ?noreblogs flag, and then checking if any given post has via/source links. Hmm. Might be easier in /mobile pages.
// Seem to get a lot of duplicate images? e.g. both
// https://media.tumblr.com/tumblr_m2gktkD7u31qdcy3io1_640.jpg and
// https://media.tumblr.com/tumblr_m2gktkD7u31qdcy3io1_1280.jpg
// Oh! Do I just not handle _640?
// Get ID from post URL, e.g. http://example.tumblr.com/post/12345/title => 12345
let post_id = parseInt( link.split( '/' )[4] ); // 12345 as a NUMBER, not a string, doofus
if( sublinks.length > 0 ) { // If this post has images we're displaying -
let this_post = "";
sublinks.map( url => {
this_post += '<a href="' + url + '" id="' + encodeURI( url ) + '">';
this_post += '<img src="' + url + '" onerror=\'' + error_function( url ) + '\'>';
this_post += '</a><br>';
this_post += '<a href="' + link + '">Permalink</a> ';
} )
display_post( this_post, post_id );
}
} )
} )
}
// Place content on page in descending order according to post ID number
// Consider rejiggering the old scrape method to use this. Move to 'universal' section if so. Alter or spin off to link posts instead?
// Turns out I never implemented ?chrono or ?reverse, so nevermind that for now.
// Remember to set options_map.chrono if ?find contains /chrono or whatever.
function display_post( content, post_id ) {
let this_node = document.createElement( "span" );
this_node.innerHTML = content;
this_node.id = post_id;
// Find lower-numbered node than post_id
let target_id = posts_placed.filter( n => n <= post_id ).sort( (a,b) => a - b ).pop(); // Take the highest number less than (or equal to) post_id - numeric sort, because default .sort() compares strings
if( options_map.find.indexOf( '/chrono' ) > 0 ) {
target_id = posts_placed.filter( n => n <= post_id ).sort( (a,b) => a - b ).shift(); // Take the... fuck... lowest? What am I doing again?
// Fuuuck, this is really inconsistent. Nevermind the looney-toons syntax I used here, =>n<=.
// Screw it, use the old scraper for now.
}
let target_node = document.getElementById( target_id );
// http://stackoverflow.com/questions/4793604/how-to-do-insert-after-in-javascript-without-using-a-library
target_node.parentNode.insertBefore( this_node, target_node ); // Insert our span just above the lower-ID node, i.e. after every higher-ID post
posts_placed.push( post_id ); // Remember that we added this ID
// No return value
}
// Return ascending or descending order depending on "chrono" setting
// function post_order_sort( a, b )
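// A sketch of what that comparator could look like if it ever gets wired in (assumes a boolean options_map.chrono, which nothing currently sets):
// function post_order_sort( a, b ) {
//     return options_map.chrono ? a - b : b - a; // Ascending for chronological order, descending (newest-first) otherwise
// }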
// ------------------------------------ Specific handling for www.tumblr.com (tag search, possibly dashboard) ------------------------------------ //
// URLs like https://www.tumblr.com/tagged/wooloo?before=1560014505 don't follow easy sequential pagination, so we have to (a) be linear or (b) guess. First whack is (a).
// Tumblr dashboard is obviously standardized, so we can make assumptions about /post links in relation to images.
// We do not fetch any individual pages. We can't. They're on different subdomains, and Tumblr CORS remains tight-assed. But we can link them like in the post scrape mode.
// Ooh, I could maybe get "behind the jump" content via /embed URLs. Posts here should contain the blog UUID and a post number.
// Copied notes from above:
// Apparently this works on https://www.tumblr.com/search/homestuck/, but requires pages like https://www.tumblr.com/search/homestuck/post_page/1 etc.
// /post_page/2 still starts on the same images. So does /10.
// fetch( '/search/homestuck/post_page/10' ).then( n => n.text() ).then( n => console.log( n ) ) - also returns the same HTML.
// After much console dickery, I can confirm all of the images are the same between pages.
// https://www.tumblr.com/search/homestuck/post_page/2/amp does not work. https://www.tumblr.com/search/homestuck/post_page/2/mobile does not work.
// I guess I should @exclude www.tumblr.com. Or at least /search.
// On the other hand, https://www.tumblr.com/tagged/homestuck sort of works. Trouble is, pages go https://www.tumblr.com/tagged/homestuck?before=1558723097 - ugh.
// I thought ?before was a /post number, that page has e.g. https://jadeharleyworksatpetco.tumblr.com/post/185111936952/hello-human-i-require-cat-food - not even close.
// Next page is https://www.tumblr.com/tagged/homestuck?before=1558720051 - yeah this is a Unix timestamp. It's epoch time.
// I could fetch each page and then get the "next" link, but consider: I don't wanna.
// https://www.tumblr.com/dashboard/2/185117399018 - but this one -is- based on /post numbers. Tumblr.com: given two choices, take all three.
// Being able to scrape and archive site-wide tags or your own dashboard would be useful. Dammit.
// I might do a separate script, because overhauling this one to support a whole different URL structure (or two or three) and linearly chain fetches... ugh. It's a big ask.
// No, it'd be easier to roll another mode into this script. It's already a hot mess. The new code just won't run on individual blogs.
// Okay, so dashboard and site-wide tag modes.
// Dashboard post numbers don't have to be real post numbers. Tag-search timestamps obviously don't have to relate to real posts.
// We de-dupe, so overkill is fine... ish. Tumblr's touchy about "rate limit exceeded" these days.
// Tag scrape would be suuuper useful if we can grab blog posts from www.tumblr.com. Like "hiveswapcomicscontest."
// Test by setting scrape results to outerhtml.
// fetch( '/tagged/homestuck' ).then( r => r.text() ).then( t => console.log( t ) )
// fetch( '/tagged/homestuck' ).then( r => r.text() ).then( t => document.body.outerHTML = t ) - works.
// fetch( '/blog/sometipsygnostalgic' ).then( r => r.text() ).then( t => document.body.outerHTML = t ) - just the dashboard. Damn.
// fetch( '/blog/sometipsygnostalgic' ).then( r => r.text() ).then( t => document.body.outerHTML = encodeURI( t ).split('tumblr.com').filter( x => x.indexOf('.jpg') > 0 ).join('\n') )
// Fucking about: fetch( '/blog/sometipsygnostalgic' ).then( r => r.text() ).then( t => document.body.outerHTML = encodeURI( t ).split('href').filter( x => x.indexOf('sometipsy') > 0 ).join('\n') )
// fetch( '/blog/sometipsygnostalgic' ).then( r => r.text() ).then( t => console.log( t.split( /[<>]/ ).filter( x => x.indexOf( 'sometipsy' ) > 0 ) ) )
// fetch( '/blog/sometipsygnostalgic' ).then( r => r.text() ).then( t => document.body.outerHTML = t.split( /[<>]/ ).filter( x => x.indexOf( 'sometipsy' ) > 0 ).join('<br>') ) - finally a useful approach, but still slim pickings.
// Okay for starters it redirects to https://www.tumblr.com/dashboard/blog/sometipsygnostalgic - try that.
// fetch( '/dashboard/blog/sometipsygnostalgic' ).then( r => r.text() ).then( t => document.body.outerHTML = t ) - still just the dashboard.
// So no luck on scraping dashboard-only blogs. Bluh.
// This is so aggravating. I can see the content, obviously. "View page source" just returns the dashboard source. But document.body.outerHTML contains the blog proper.
// No index of '/page'. Some 400K-ish indices for elements specific to visible posts, like image IDs and text.
// document.body.outerHTML.substring( 442500 ) - a few scripts at the bottom of the page, but they're bizarre. Their source is https://px.srvcs.tumblr.com/impixu plus a lengthy bit of high-entropy nonsense. Maybe a key, maybe a value, I don't know.
// Right after the post that was originally at the bottom - the post right before infinite scrolling added more shit - there's a bit like this:
//
// https://safe.txmblr.com/svc/embed/iframe/sometipsygnostalgic/185276376531 worked - other post numbers did not. Tumblr, why.
// Might be a referrer / cache thing? Ctrl+F5'd on the one that worked; still works. Fuck me, apparently.
// Might still be a referrer thing, but require an iframe with id= matching the #embed-etc crap. Or that's a backend value and we're fucked. I don't know.
// There's no instance of the next post's value in the page content. Though I guess it's probably some 'after' value? No unusual instances of that either.
// The page footer is some nonsense about knight-rider-container, knight-rider-loader, and knight-rider-bar.
// Okay so I finally just googled "tumblr infinite scrolling" and https://coderwall.com/p/k7sj-q/ridiculously-easy-infinite-scrolling-for-tumblr pointed me to http://static.tumblr.com/c7cvhvu/mCZmb3z94/is.js which is accessible but minified.
// Poking through. It does some URL-as-string shenanigans to get the /page and /tagged values.
// Basically just creates a div with the next /page. Arg.
// Consider /embed again:
// https://embed.tumblr.com/embed/post/4RwtewsxXp-k1ReCcdAgXg/185288559546?width=542&language=en_US&did=a5c973d33a43ace664986204d72d7739de31b614
// This works but provides no previous/next link. (We need the ID, but we can get it from www.tumblr.com, then redirect.)
// Using DaveJaders for testing. https://davejaders.tumblr.com/archive does not redirect, so we can use that for same-origin fetches. Does /page stuff work?
// fetch( '/' ).then( r => r.text() ).then( t => document.body.outerHTML = t ) - CORS failure. "The Same Origin Policy disallows reading the remote resource at https://www.tumblr.com/login_required/davejaders . (Reason: CORS header ‘Access-Control-Allow-Origin’ missing)." Fuck me again, apparently.
// We now go right here for any ?ezastumblrscrape URL on www.tumblr.com. Sloppy but functional. Page is clear but blank.
function scrape_www_tagged( ) {
// Standard-ish initial steps: clear page (handled), add controls, maybe change window title.
// We can't use truly standard prev/next navigation. Even officially, you only get "next" links. (Page count options should still work.)
let www_tagged_next = "
Next >>>";
let pages_www_tagged = "";
pages_www_tagged += "
10, ";
pages_www_tagged += "
5, ";
pages_www_tagged += "
1 - >>>";
mydiv.innerHTML += "
" + www_tagged_next + "
" + pages_www_tagged + "";
mydiv.innerHTML += "This mode is under development and subject to change.
";
document.getElementById("bottom_controls_div").innerHTML += "
" + pages_www_tagged + "
" + www_tagged_next + "";
mydiv.innerHTML += image_size_options() + "<br>";
mydiv.innerHTML += image_resolution_options() + "<br>";
// ?find=/tagged/whatever is already populated by existing scrape links, but any ?key=value stuff gets lost.
// (If no ?before=timestamp, fetch first page. ?before=0 works. Some max_int would be preferable for sorting. No exceptions needed, then.)
if( isNaN( options_map.before ) ) { options_map.before = 0; }
// if( options_map.thumbnails == null ) { options_map.thumbnails = "fixed_width"; } // Doesn't necessarily go here, but we don't set it otherwise.
// Fetch first page specified by ?before=timestamp.
let tagged_url = "" + options_map.find + "?before=" + options_map.before; // Relative URLs are guaranteed to be same-domain, even if they're garbage.
fetch( tagged_url, { credentials: 'include' } ).then( r => r.text() ).then( text => {
display_www_tagged( text, options_map.before, options_map.pagesatonce ); // ... pagesatonce gets set in the pre-amble, right? It should have a default.
// Optionally we could check here if options_map.before == 0 and instead send max_safe_integer.
} )
}
// Either we need a global variable for how many more pages per... page... or else I should pass a how_many_more value to this recursive function.
function display_www_tagged( content, timestamp, pages_left ) {
// First, grab the Next link - i.e. its ?before=timestamp value.
// let next_timestamp_index = content.lastIndexOf( '?before=' );
// let next_timestamp = content.substring( next_timestamp_index + 8, content.indexOf( next_timestamp_index, '"' ) ); // Untested
let next_timestamp = content.split( '?before=' ).pop().split( '"' ).shift(); // Neither efficient nor really simple, but readable and fast to write. Clunk.
if( pages_left > 1 ) { // If we're displaying more pages, fetch the next one and recurse.
let tagged_url = "" + options_map.find + "?before=" + next_timestamp; // Relative URLs are guaranteed to be same-domain, even if they're garbage.
// console.log( tagged_url );
fetch( tagged_url, { credentials: 'include' } ).then( r => r.text() ).then( text => {
display_www_tagged( text, next_timestamp, pages_left - 1 );
} )
} else { // Otherwise put that timestamp in our constructed Next link(s).
// I guess... get HTMLcollection of elements for "next" links, and change each one.
// Downside: links will only change once the last page is fetched. We could tack on a ?before for every fetch, but it would get silly. Right?
let next_links = Array.from( document.getElementsByClassName( 'www_next' ) ); // I'm not dealing with a live object unless I have to.
for( const link of next_links ) { link.href += "?before=" + next_timestamp; }
}
// Insert div for this timestamp's page.
let new_div = document.createElement( 'span' ); // Span, because divs cause line breaks. Whoops.
new_div.id = "" + timestamp;
let target_node = document.getElementById( 'bottom_controls_div' );
target_node.parentNode.insertBefore( new_div, target_node ); // Insert each page before the footer.
let div_html = "";
// Separate page HTML by posts.
// At least the "li" elements aren't nested, so I can terminate the last one on "". Or... all of them.
let posts = content.split( '
' )[0] ); // Terminate last element at . Again, not great code, but clunk clunk clunk get it done.
// For each post:
for( const post of posts ) {
// Extract images from each post.
let links = links_from_page( post );
links = links.map( image_standardizer );
links = links.filter( novelty_filter );
links = links.filter( tumblr_blacklist_filter );
// document.body.innerHTML += links.join( "
" ) + "
"; // Debug
// Separate the images.
let images = links.filter( s => s.indexOf( 'media.tumblr.com' ) > 0 ); // Note: this will exclude external images, e.g. embedded Twitter stuff.
// If this post has images:
if( images.length > 0 ) { // Build HTML xor insert div for each post, to display images.
// Get /post URL, including blog name etc.
let permalink = links.filter( s => s.indexOf( '.tumblr.com/post' ) > 0 )[0];
let post_html = "";
for( const image of images ) {
post_html += '<a href="' + image + '" id="' + encodeURI( image ) + '">';
post_html += '<img src="' + image + '" onerror=\'' + error_function( image ) + '\'>';
post_html += '</a><br>';
post_html += '<a href="' + permalink + '">Permalink</a> ';
}
div_html += post_html;
}
}
// Insert accumulated HTML into this div.
new_div.innerHTML = div_html;
}
// ------------------------------------ HTML-returning functions for duplication prevention ------------------------------------ //
// Return HTML for standard Previous / Next controls (<<< Previous - Next >>>)
function html_previous_next_navigation() {
let prev_next_controls = "";
if( options_map.startpage > 1 ) {
prev_next_controls += "
<<< Previous - ";
}
prev_next_controls += "
Next >>>";
return prev_next_controls;
}
// Return HTML for pages-at-once versions of previous/next page navigation controls (<<< 10, 5, 1 - 1, 5, 10 >>>)
function html_page_count_navigation() {
let prev_next_controls = "";
if( options_map.startpage > 1 ) { // <<< 10, 5, 1 -
prev_next_controls += "<<< ";
prev_next_controls += "
1, ";
prev_next_controls += "
5, ";
prev_next_controls += "
10 - ";
}
prev_next_controls += "
10, ";
prev_next_controls += "
5, ";
prev_next_controls += "
1 - ";
prev_next_controls += ">>>";
return prev_next_controls;
}
// Return HTML for image-size options (changes via CSS or via URL parameters)
// This used to work. It still works, in the new www mode I just wrote. What the fuck do I have to do for some goddamn onclick behavior?
// "Content Security Policy: The page’s settings blocked the loading of a resource at self (“script-src https://lalilalup.tumblr.com https://assets.tumblr.com/pop/ 'nonce-OTA4NjViZmE2MzZkYTFjMjM1OGZkZGM1MzkwYWU4NTA='”)." What the fuck.
// Jesus Christ, it might be yet again because of Tumblr's tightass settings:
// https://stackoverflow.com/questions/37298608/content-security-policy-the-pages-settings-blocked-the-loading-of-a-resource
// The function this still works in is on a "not found" page. The places it will not work are /archive pages.
// Yeah, on /mobile instead of /archive it works fine. Fuck you, Tumblr.
// Jesus, that means even onError can't work.
function image_size_options() {
var html_string = "Immediate: \t"; // Change class to instantly resize images, temporarily
html_string += "
Original image sizes - ";
html_string += "
Snap columns - ";
html_string += "
Snap rows - ";
html_string += "
Fit width - ";
html_string += "
Fit height - ";
html_string += "
Fit both";
html_string += "Persistent: \t"; // Reload page with different image mode that will stick for previous/next pages
html_string += "
Original image sizes - "; // This is the CSS default, so any other value works
html_string += "
Snap columns - ";
html_string += "
Snap rows - ";
html_string += "
Fit width - ";
html_string += "
Fit height - ";
html_string += "
Fit both";
return html_string;
}
// Return HTML for links to ?maxres versions of the same page, e.g. "_raw" versus "_1280"
function image_resolution_options() {
var html_string = "Maximum resolution: \t";
html_string += "
Raw - "; // I'm not 100% sure "_raw" works anymore, but the error function handles it, so whatever.
html_string += "
1280 - ";
html_string += "
500 - ";
html_string += "
400 - ";
html_string += "
250 - ";
html_string += "
100";
return html_string;
}
// Return links to other parts of Eza's Tumblr Scrape functionality, possibly excluding whatever you're currently doing
// Switch to full-size images - Toggle image size - Show one page at once - Scrape whole Tumblr - (Experimental fetch-every-post image browser)
// This mode is under development and subject to change. - Return to original image browser
function html_ezastumblrscrape_options() {
let html_string = "";
// "You are browsing" text? Tell people where they are and what they're looking at.
html_string += "
Scrape whole Tumblr - ";
html_string += "
Browse images - "; // Default mode; so any value works
html_string += "
(Experimental fetch-every-post image browser) ";
return html_string;
}
function error_function( url ) {
// This clunky <img> onerror function looks for a lower-res image if the high-res version doesn't exist.
// Surprisingly, this does still matter. E.g. http://66.media.tumblr.com/ba99a55896a14a2e083cec076f159956/tumblr_inline_nyuc77wUR01ryfvr9_500.gif
// This might mismatch _100 images and _250 links because of that self-erasing clause... but it's super rare, so meh.
let on_error = 'if(this.src.indexOf("_raw")>0){this.src=this.src.replace("_raw","_1280").replace("//media","//66.media");}'; // Swap _raw for 1280, add CDN number
on_error += 'else if(this.src.indexOf("_1280")>0){this.src=this.src.replace("_1280","_500");}'; // Swap 1280 for 500
on_error += 'else if(this.src.indexOf("_500")>0){this.src=this.src.replace("_500","_400");}'; // Or swap 500 for 400
on_error += 'else if(this.src.indexOf("_400")>0){this.src=this.src.replace("_400","_250");}'; // Or swap 400 for 250
on_error += 'else{this.src=this.src.replace("_250","_100");this.onerror=null;}'; // Or swap 250 for 100, then give up
on_error += 'document.getElementById("' + encodeURI( url ) + '").href=this.src;'; // Link the image to itself, regardless of size
return on_error;
}
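// How the string above gets used, roughly (a sketch matching the display functions, not a verbatim copy):
// this_post += '<a href="' + url + '" id="' + encodeURI( url ) + '">';               // Anchor carries id=encodeURI(url)...
// this_post += '<img src="' + url + '" onerror=\'' + error_function( url ) + '\'>';  // ...so the handler's last line can re-point its href
// this_post += '</a>';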
// ------------------------------------ Universal page-scraping function (and other helper functions) ------------------------------------ //
// Add URLs from a 'blank' page to page_dupe_hash (without just calling soft_scrape_page_promise and ignoring its results)
function exclude_content_example( url ) {
fetch( url, { credentials: 'include' } ).then( r => r.text() ).then( text => {
let links = links_from_page( text );
links = links.filter( novelty_filter ); // Novelty filter twice, because image_standardizer munges some /post URLs
links = links.map( image_standardizer );
links = links.filter( novelty_filter );
} )
// No return value
}
// Spaghetti to reduce redundancy: given a page's text, return a list of URLs.
function links_from_page( html_copy ) {
// Cut off the page at the "More you might like" / "Related posts" footer, on themes that have one
html_copy = html_copy.split( '="related-posts' ).shift();
let http_array = html_copy.split( /['="']http/ ); // Regex split on anything that looks like a source or href declaration
http_array.shift(); // Ditch first element, which is just everything before the first link
http_array = http_array.map( s => { // Theoretically parallel .map instead of maybe-linear .forEach or low-level for() loop
if( s.indexOf( "&" ) > -1 ) { s = htmlDecode( s ); } // Yes a fucking " should match a goddamn regex for terminating on quotes!
s = s.split( /['<>"']/ )[0]; // Terminate each element (split on any terminator, take first subelement)
s = s.replace( /\\/g, '' ); // Remove escaping backslashes (e.g. http\/\/ -> http//)
if( s.indexOf( "%3A%2F%2F" ) > -1 ) { s = decodeURIComponent( s ); } // What is with all the http%3A%2F%2F URLs?
// s = s.split( '"' )[0]; // Yes these count as doublequotes you stupid broken scripting language.
return "http" + s; // Oh yeah, add http back in (regex eats it)
} ) // http_array now contains an array of strings that should be URLs
let post_array = html_copy.split( /['="']\/post/ ); // Regex split on anything that looks like an href="/post" link
post_array.shift(); // Ditch first element, which is just everything before the first /post link
post_array = post_array.map( s => { // Theoretically parallel .map instead of maybe-linear .forEach or low-level for() loop
s = s.split( /['<>"']/ )[0]; // Terminate each element (split on any terminator, take first subelement)
return window.location.protocol + "//" + window.location.hostname + "/post" + s; // Oh yeah, add /post back in (regex eats it)
} ) // post_array now contains an array of strings that should be photoset URLs
http_array = http_array.concat( post_array ); // Photosets are out of order again. Blar.
return http_array;
}
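// Hand-traced example:
// links_from_page( '<a href="http://example.tumblr.com/post/123">x</a>' ) => [ "http://example.tumblr.com/post/123" ]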
// Filter: Return false for typical Tumblr nonsense (JS, avatars, RSS, etc.)
function tumblr_blacklist_filter( url ) {
if( url.indexOf( "/reblog/" ) > 0 ||
url.indexOf( "/tagged/" ) > 0 || // Might get removed so the script can track and report tag use. Stupid art tags like 'my-draws' or 'art-poop' are a pain to find.
url.indexOf( ".tumblr.com/avatar_" ) > 0 ||
url.indexOf( ".tumblr.com/image/" ) > 0 ||
url.indexOf( ".tumblr.com/rss" ) > 0 ||
url.indexOf( "srvcs.tumblr.com" ) > 0 ||
url.indexOf( "assets.tumblr.com" ) > 0 ||
url.indexOf( "schema.org" ) > 0 ||
url.indexOf( ".js" ) > 0 ||
url.indexOf( ".css" ) > 0 ||
url.indexOf( "twitter.com/intent" ) > 0 || // Weirdly common now
url.indexOf( "tmblr.co/" ) > 0 ||
url.indexOf( "ezastumblrscrape" ) > 0 ) // Somehow this script is running on pages being fetched, inserting a link. Okay. Sure.
{ return false } else { return true }
}
// Return standard canonical URL for various resizes of Tumblr images - size of _1280, single CDN
// 10/14 - ?usesmall seems to miss the CDN sometimes?
// e.g. http://mooseman-draws.tumblr.com/archive?startpage=1?pagesatonce=5?thumbnails?ezastumblrscrape?scrapemode=everypost?lastpage=37?usesmall
// https://66.media.tumblr.com/d970fff86185d6a51904e0047de6e764/tumblr_ookdvk7foy1tf83r7o1_400.png sometimes redirects to 78.media and _raw. What?
// Oh, probably not my script. I fucked with the Tumblr redirect script I use, but didn't handle the lack of CDN in _raw sizes.
function image_standardizer( url ) {
// Some lower-size images are automatically resized. We'll change the URL to the maximum size just in case, and Tumblr will provide the highest resolution.
// Replace all resizes with _1280 versions. Nearly all _1280 URLs resolve to highest-resolution versions now, so we don't need to e.g. handle GIFs separately.
// Oh hey, Tumblr now has _raw for a no-bullshit as-large-as-possible setting.
// _raw only works without the CDN - so //media.tumblr yes, but //66.media.tumblr no. This complicates things.
// Does //media and _raw always work? No, of course not. So we still need on_error.
// url = url.replace( "_540.", "_1280." ).replace( "_500.", "_1280." ).replace( "_400.", "_1280." ).replace( "_250.", "_1280." ).replace( "_100.", "_1280." );
let maxres = "1280"; // It is increasingly unlikely that _raw still works. Reconsider CDN handling if that's the case.
if( options_map.maxres ) { maxres = options_map.maxres } // If it's set, use it. Should be _100, _250, whatever. ?usesmall should set it to _400. ?notraw, _1280.
maxres = "_" + maxres + "."; // Keep the URL options clean: "400" instead of "_400." etc.
url = url.replace( "_raw", maxres ).replace( "_1280.", maxres ).replace( "_640.", maxres ).replace( "_540.", maxres )
.replace( "_500.", maxres ).replace( "_400.", maxres ).replace( "_250.", maxres ).replace( "_100.", maxres );
// henrythehangman.tumblr.com has doubled images from /image posts in ?scrapemode=everypost. Lots of _1280.jpg?.jpg nonsense.
// Is that typical for tumblrs with this theme? It's one of those annoying magnifying-glass-on-hover deals. If it's just that one weird fetish site, remove this later.
url = url.split('?')[0]; // Ditch anything past the first question mark, if one exists
url = url.split('&')[0]; // Ditch anything past the first ampersand, if one exists - e.g. speikobrarote.tumblr.com
if( url.indexOf( 'tumblr.com' ) > 0 ) { url = url.split( ' ' )[0]; } // Ditch anything past a trailing space, if one exists - e.g. cinnasmut.tumblr.com
// Standardize media subdomain / CDN subsubdomain, to prevent duplicates and fix _1280 vs _raw complications.
if( url.indexOf( '.media.tumblr.com/' ) > 0 ) {
let url_parts = url.split( '/' )
url_parts[2] = '66.media.tumblr.com'; // This came first. Then //media.tumblr.com worked, even for _raw. Then _raw went away. Now it needs a CDN# again. Bluh.
// url_parts[2] = 'media.tumblr.com'; // 2014: write a thing. 2016: comment out old thing, write new thing. 2018: uncomment old thing, comment new thing. This script.
url = url_parts.join( '/' ).replace( 'http:', 'https:' );
}
/* // ?notraw and ?usesmall are deprecated. Use ?maxres=1280 or ?maxres=400 instead.
// Change back to _1280 & CDN for ?scrapewholesite (which does no testing). _raw is unreliable.
if( options_map.scrapemode == 'scrapewholesite' || options_map.notraw ) {
url = url.replace( "_raw.", "_1280." ).replace( '//media', '//66.media' );
}
// Change to something smaller for quick browsing, like _500 or _250
// https://78.media.tumblr.com/166565b897f228352069b290067215c0/tumblr_oozoucFu0O1v1bsoxo2_raw.jpg etc don't work. What?
if( options_map.scrapemode != 'scrapewholesite' && options_map.usesmall ) { // Ignore this on scrapewholesite; that'd be a dumb side effect.
url = url.replace( "_raw.", "_400." ).replace( "_1280.", "_400." ).replace( '//media', '//66.media' );
}
*/
return url;
}
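// Hand-traced example (with the default maxres and no ?usesmall):
// image_standardizer( "http://78.media.tumblr.com/tumblr_abc123_500.jpg" )
//     => "https://66.media.tumblr.com/tumblr_abc123_1280.jpg"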
// Remove duplicates from an array (from an iterable?) - returns de-duped array
// Credit to http://stackoverflow.com/questions/9229645/remove-duplicates-from-javascript-array for hash-based string method
function remove_duplicates( list ) {
let seen = {};
list = list.filter( function( item ) {
return seen.hasOwnProperty( item ) ? false : ( seen[ item ] = true );
} );
return list;
}
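// Footnote: ES6 makes this a one-liner, since Set iteration preserves insertion order per spec:
// function remove_duplicates( list ) { return [ ...new Set( list ) ]; }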
// Filter: Return true ONCE for any given string.
// Global duplicate remover - return false for items found in page_dupe_hash, otherwise add new items to it and return true
// Now also counts instances of each non-unique argument
function novelty_filter( url ) {
// return page_dupe_hash.hasOwnProperty( url ) ? false : ( page_dupe_hash[ url ] = true );
// console.log( page_dupe_hash ); // DEBUG
url = url.toLowerCase(); // Debug-ish, mostly for "tag overview." URLs can be case-sensitive but collisions will be rare. "Not all images are guaranteed to appear."
if( page_dupe_hash.hasOwnProperty( url ) ) {
page_dupe_hash[ url ] += 1;
return false;
} else {
page_dupe_hash[ url ] = 1;
return true;
}
}
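// e.g. the first novelty_filter( "http://x.jpg" ) returns true; every later call with the same URL returns false
// and bumps page_dupe_hash["http://x.jpg"], so the hash doubles as the per-URL tally that "tag overview" leans on.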
// Filter: Return true for any Tumblr /tagged URL.
// This is used separately from the other filters. Those go X=X.filter( remove stuff ), X=X.filter( remove other stuff ). This should run first, like Y=X.filter( return_tags ).
function return_tags( url ) {
// return url.indexOf( '/tagged' ) > 0; // 'if( condition ) { return true } else { return false }' === return condition.
if( url.indexOf( '/tagged' ) > 0 ) { return true; } else { return false; }
}
// Viewing a bare image, redirect to largest available image size.
// e.g. https://66.media.tumblr.com/d020ac15b0e04ff9381f246ed08c9f05/tumblr_o2lok9yf8O1ub76b5o1_1280.jpg
// This is a sloppy knockoff of Tumblr Image Size 1.1, because that script doesn't affect //media.tumblr.com URLs.
// Also, this is a distinct mode, not a helper function. Maybe just move it up to the start?
function maximize_image_size() {
if( window.location.href.indexOf( "_1280." ) > -1 ) { return false; } // If it's already max-size, die.
if( window.location.href.indexOf( "_raw." ) > -1 ) { return false; } // If it's already max-size, die.
// Nope, still broken. E.g. https://78.media.tumblr.com/tumblr_mequmpAxgp1r9tb2u.gif
// Needs a positive test for all the replacements... or a check if this URL is different from the changed URL.
// This should probably use image_standardizer to avoid duplicate code.
let replacement = image_standardizer( window.location.href );
if( window.location.href != replacement ) { window.location.href = replacement; }
/*
let maxres = "_1280.";
// maxres = "_" + maxres + "."; // Keep the URL options clean: "400" instead of "_400." etc.
url = window.location.href;
url = url.replace( "_1280.", maxres ).replace( "_540.", maxres ).replace( "_500.", maxres ).replace( "_400.", maxres ).replace( "_250.", maxres ).replace( "_100.", maxres );
if( window.location.href != url ) { window.location.href = url; }
*/
}
// Decode entity references.
// This is officially the stupidest polyfill I've ever used. How is there an entire class of standard escaped text with NO standard decoder in Javascript?
// Copied straight from https://stackoverflow.com/questions/1912501/unescape-html-entities-in-javascript because decency was not an option.
function htmlDecode(input) {
return new DOMParser().parseFromString(input, "text/html").documentElement.textContent;
}
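// e.g. htmlDecode( "foo.jpg&amp;w=250" ) => "foo.jpg&w=250" - exactly the &amp; mess links_from_page() trips over.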
// Given the bare HTML of a Tumblr page, return an array of Promises for image/video/link URLs
function soft_scrape_page_promise( html_copy ) {
// Linear portion:
let http_array = links_from_page( html_copy ); // Split bare HTML into link and image sources
http_array.filter( url => url.indexOf( '/tagged/' ) > 0 ).filter( novelty_filter ); // Track tags for statistics, before the blacklist removes them
http_array = http_array.filter( tumblr_blacklist_filter ); // Blacklist filter for URLs - typical garbage
function is_an_image( url ) {
// Whitelist URLs with image file extensions or Tumblr iframe indicators
var image_link = false;
if( url.indexOf( ".gif" ) > 0 ) { image_link = true; }
if( url.indexOf( ".jpg" ) > 0 ) { image_link = true; }
if( url.indexOf( ".jpeg" ) > 0 ) { image_link = true; }
if( url.indexOf( ".png" ) > 0 ) { image_link = true; }
if( url.indexOf( "/photoset_iframe" ) > 0 ) { image_link = true; }
if( url.indexOf( ".tumblr.com/video/" ) > 0 ) { image_link = true; }
if( url.indexOf( "/audio_player_iframe/" ) > 0 ) { image_link = true; }
return image_link;
}
// Separate the images
http_array = http_array.map( url => {
if( is_an_image( url ) ) { // If it's an image, get rid of any Tumblr variability about resolution or CDNs, to avoid duplicates with nonmatching URLs
return image_standardizer( url );
} else { // Else if not an image
if( url.indexOf( window.location.host ) > 0 ) { url += "#local" } else { url += "#offsite" } // Mark in-domain vs. out-of-domain URLs.
if( options_map.imagesonly ) { return ""; } // ?imagesonly to skip links on ?scrapewholesite
return url + "#link";
}
} )
.filter( n => { // Remove all empty strings, where "empty" can involve a lot of #gratuitous #tags.
if( n.split("#")[0] === "" ) { return false } else { return true }
} );
http_array = remove_duplicates( http_array ); // Remove duplicates within the list
http_array = http_array.filter( novelty_filter ); // Remove duplicates throughout the page
// Should this be skipped on scrapewholesite? Might be slowing things down.
// Async portion:
// Return promise that resolves to list of URLs, including fetched videos and photoset sub-images
return Promise.all( http_array.map( s => {
if( s.indexOf( '/photoset_iframe' ) > 0 ) { // If this URL is a photoset, return a promise for an array of URLs
return fetch( s, { credentials: 'include' } ).then( r => r.text() ).then( text => { // Fetch URL, get body text from response
var photos = text.split( 'href="' ); // Isolate photoset elements from href= declarations
photos.shift(); // Get rid of first element because it's everything before the first "href"
photos = photos.map( p => p.split( '"' )[0] + "#photoset" ); // Tag all photoset images as such, just because
photos[0] += "#" + s; // Tag first image in set with photoset URL so browse mode can link to it
return photos;
} )
}
else if ( s.indexOf( '.tumblr.com/video/' ) > 0 ) { // Else if this URL is an embedded video, return a Tumblr-standard URL for the bare video file
var subdomain = s.split( '/' ); // E.g. https://www.tumblr.com/video/examplename/123456/500/ -> https,,www.tumblr.com,video,examplename,123456,500
var video_post = window.location.protocol + "//" + subdomain[4] + ".tumblr.com/post/" + subdomain[5] + "/";
// e.g. http://examplename.tumblr.com/post/123456/ - note window.location.protocol vs. subdomain[0], maintaining http/https locally
return fetch( video_post, { credentials: 'include' } ).then( r => r.text() ).then( text => {
if( text.indexOf( 'og:image' ) > 0 ) { // property="og:image" content="http://67.media.tumblr.com/tumblr_123456_frame1.jpg" --> tumblr_123456_frame1.jpg
var video_name = text.split( 'og:image' )[1].split( 'media.tumblr.com' )[1].split( '"' )[0].split( '/' ).pop();
} else if( text.indexOf( 'poster=' ) > 0 ) { // poster='https://31.media.tumblr.com/tumblr_nuzyxqeJNh1rjoppl_frame1.jpg'
var video_name = text.split( "poster='" )[1].split( 'media.tumblr.com' )[1].split( "'" )[0].split( '/' ).pop(); // Bandaid solution. Tumblr just sucks.
} else {
return video_post + '#video'; // Current methods miss the whole page if these splits miss, so fuck it, just return -something.-
}
// tumblr_abcdef12345_frame1.jpg -> tumblr_abcdef12345.mp4
video_name = "tumblr_" + video_name.split( '_' )[1] + ".mp4#video";
video_name = "https://vt.tumblr.com/" + video_name; // Standard Tumblr-wide video server
return video_name; // Should be e.g. https://vt.tumblr.com/tumblr_abcdef12345.mp4
} )
}
else if ( s.indexOf( "/audio_player_iframe/" ) > 0 ) { // Else if this URL is an embedded audio file, return... well not a standard URL perhaps, but an URL.
// How the fuck do I download audio? Video works-ish. Audio is not well-supported.
// http://articulatelydecomposed.tumblr.com/post/176171225450/you-know-i-had-to-do-it-the-new-friendsim-had-a
// Ctrl+F "plays". Points to:
// http://articulatelydecomposed.tumblr.com/post/176171225450/audio_player_iframe/articulatelydecomposed/tumblr_pcaffm6o6H1rc8keu?audio_file=https%3A%2F%2Fwww.tumblr.com%2Faudio_file%2Farticulatelydecomposed%2F176171225450%2Ftumblr_pcaffm6o6H1rc8keu&color=white&simple=1
// Still no bare .mp3 link in that iframe.
// data-stream-url="https://www.tumblr.com/audio_file/articulatelydecomposed/176171225450/tumblr_pcaffm6o6H1rc8keu" ?
// Indeed. Actually that might be in the iframe link. What's that function for decoding URIs?
// Wait, the actual file resolves to https://a.tumblr.com/tumblr_pcaffm6o6H1rc8keuo1.mp3 - this is trivial.
// let audio_name = s.split( '/' ).pop(); // Ignore text before last slash.
// audio_name = audio_name.split( '?' ).shift(); // Ignore text after a question mark, if there is one.
// audio_name should now look something like 'tumblr_12345abdce'.
// return 'https://a.tumblr.com/' + audio_name + '1.mp3#FUCK'; // Standard tumblr-wide video server. Hopefully.
// Inconsistent.
// e.g. http://articulatelydecomposed.tumblr.com/post/176307067285/mintchocolatechimp-written-by-reddit-user shows
// https://a.tumblr.com/tumblr_pce3snfthB1ufch4go1.mp3#offsite#link which works, but also
// https://a.tumblr.com/tumblr_pce3snfthB1ufch4go1.mp3&color=white&simple=1#offsite#link which doesn't and
// https://a.tumblr.com/tumblr_pce3snfthB1ufch4g1.mp3 which also doesn't.
// ... both of the 'go1' links are present even without this iframe-based code, because that audio has a 'download' link.
// Yeah, so much for the simple answer. Fetch. No URL processing seems necessary - 's' is already on this domain.
return fetch( s, { credentials: 'include' } ).then( r => r.text() ).then( text => {
// data-stream-url="https://a.tumblr.com/tumblr_pce3snfthB1ufch4go1.mp3"
let data_url = text.split( 'data-stream-url="' )[1]; // Drop everything before the file declaration.
data_url = data_url.split( '"' )[0]; // Drop everything after the doublequote. Probably not efficient, but fuck regexes.
return data_url + "#audio";
} )
// Alright, well now it sort of works, but sometimes it returns e.g.
// https://www.tumblr.com/audio_file/articulatelydecomposed/176010754365/tumblr_pc1pq1TsD31rc8keu#FRIG which resolves to
// https://a.tumblr.com/tumblr_pc1pq1TsD31rc8keuo1.mp3#FRIG
// Huh. That -is- the correctly-identified "data-stream-url" in some iframes. Shit.
// This is set up to handle multiple return values, right? For photosets? So I -could- return the correct URL and some guesses.
// But at that point - why not just guess from the iframe URL? +1.mp3 and also +o1.mp3.
// Can't just make up a username or post number for the /audio_file sort of URL.
}
return Promise.resolve( [s] ); // Else if this URL is singular, return a single element... resolved as a promise for Promise.all, in an array for Array.concat. Whee.
} ) )
.then( nested_array => { // Given the Promise.all'd array of resolved URLs and URL-arrays
return [].concat.apply( [], nested_array ); // Concatenate array of arrays - apply turns array into comma-separated list, concat turns CSL of arrays into a single array
} )
}
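// Typical call pattern, for reference (a sketch - page_url stands in for any /page/N address):
// fetch( page_url, { credentials: 'include' } )
//     .then( r => r.text() )
//     .then( soft_scrape_page_promise )
//     .then( url_list => { /* flat array of image/video/link URLs, tagged with #photoset / #video / #link etc. */ } );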
// Returns a URL with all the options_map options in ?key=value format - optionally allowing changes to options in the returned URL
// Valid uses:
// options_url() -> all current settings, no changes
// options_url( "name", number ) -> ?name=number
// options_url( "name", true ) -> ?name
// options_url( {name:number} ) -> ?name=number
// options_url( {name:number, other:true} ) -> ?name=number?other
// Note that simply passing "name" will remove ?name, not add it, because the value will evaluate false. I should probably change this? Eh, { key } without :value causes errors.
function options_url( key, value ) {
var copy_map = new Object();
for( var i in options_map ) { copy_map[ i ] = options_map[ i ]; }
// In any sensible language, this would read "copy_map = object_map." Javascript genuinely does not know how to copy objects. Fuck's sake.
if( typeof key === 'string' ) { // the parameters are optional. just calling options_url() will return e.g. example.tumblr.com/archive?ezastumblrscrape?startpage=1
if( !value ) { value = false; } // if there's no value then use false
copy_map[ key ] = value; // change this key, so we can e.g. link to example.tumblr.com/archive?ezastumblrscrape?startpage=2
}
else if( typeof key === 'object' ) { // If we're passed a hashmap
for( var i in key ) {
if( ! key[ i ] ) { key[ i ] = false; } // Turn any false evaluation into an explicit boolean - this might not be necessary
copy_map[ i ] = key[ i ]; // Press key-object values onto copy_map-object values
}
}
// Construct URL from options
var base_site = window.location.href.substring( 0, window.location.href.indexOf( "?" ) ); // should include /archive, but if not, it still works on most pages
for( var k in copy_map ) { // JS maps are weird. We're actually setting attributes of a generic object. So map[ "thumbnails" ] is the same as map.thumbnails.
if( copy_map[ k ] ) { // Unless the value is False, print a ?key=value pair.
base_site += "?" + k;
if( copy_map[ k ] !== true ) { base_site += "=" + copy_map[ k ]; } // If the value is boolean true, just print the key as a flag - no "=value" part.
}
}
return base_site;
}