// ==UserScript==
// @name        HIT Scraper (classic version)
// @author      Kerek
// @description Snag HITs.
// Based in part on code from mmmturkeybacon Export Mturk History and mmmturkeybacon Color Coded Search with Checkpoints
// @namespace   http://userscripts.org/users/536998
// @match       https://www.mturk.com/mturk/findhits?match=true#hit_scraper*
// @match       https://www.mturk.com/mturk/findhits?match=true?hit_scraper*
// @version     1.3.0.1
// @grant       GM_xmlhttpRequest
// @grant       GM_getValue
// @grant       GM_setValue
// @require     http://code.jquery.com/jquery-latest.min.js
// @downloadURL none
// ==/UserScript==

// Each metadata key above and each statement below must sit on its own line:
// a `//` comment runs to the end of the line, so anything sharing a line with
// one is silently commented out.

//alter the requester ignore list as you desire, just follow the format below and use EXACT capitalization e.g., CrowdSource
var ignore_list = ["Oscar Smith", "Jon Brelig"];

//this searches extra pages if you skip too much, helps fill out results if you hit a chunk of ignored HITs. Change to true for this behavior.
var correct_for_skips = false;

//weight the four TO ratings for the coloring. Default has pay twice as important as fairness and nothing for communication and fast.
var COMM_WEIGHT = 0; var PAY_WEIGHT = 10; var FAIR_WEIGHT = 5; var FAST_WEIGHT = 0; //display your hitdb records if applicable var check_hitDB = true; //default text size var default_text_size=11; var HITStorage = {}; var indexedDB = window.indexedDB || window.webkitIndexedDB || window.mozIndexedDB; window.IDBTransaction = window.IDBTransaction || window.webkitIDBTransaction || window.mozIDBTransaction; window.IDBKeyRange = window.IDBKeyRange || window.webkitIDBKeyRange || window.mozIDBKeyRange; HITStorage.IDBTransactionModes = { "READ_ONLY": "readonly", "READ_WRITE": "readwrite", "VERSION_CHANGE": "versionchange" }; var IDBKeyRange = window.IDBKeyRange; HITStorage.indexedDB = {}; HITStorage.indexedDB = {}; HITStorage.indexedDB.db = null; HITStorage.indexedDB.onerror = function(e) { console.log(e); }; var v=4; HITStorage.indexedDB.checkTitle = function(title,button) { var request = indexedDB.open("HITDB", v); request.onsuccess = function(e) { HITStorage.indexedDB.db = e.target.result; var db = HITStorage.indexedDB.db; if (!db.objectStoreNames.contains("HIT")) { db.close(); return; } var trans = db.transaction(["HIT"], HITStorage.IDBTransactionModes.READ_ONLY); var store = trans.objectStore("HIT"); var index = store.index("title"); index.get(title).onsuccess = function(event) { if (event.target.result === undefined) { console.log(title + ' not found'); history[button].titledb=false; } else { console.log(title + ' found'); history[button].titledb=true; } db.close(); }; }; request.onerror = HITStorage.indexedDB.onerror; }; HITStorage.indexedDB.checkRequester = function(id,button) { var request = indexedDB.open("HITDB", v); request.onsuccess = function(e) { HITStorage.indexedDB.db = e.target.result; var db = HITStorage.indexedDB.db; if (!db.objectStoreNames.contains("HIT")) { db.close(); return; } var trans = db.transaction(["HIT"], HITStorage.IDBTransactionModes.READ_ONLY); var store = trans.objectStore("HIT"); var index = store.index("requesterId"); 
index.get(id).onsuccess = function(event) { if (event.target.result === undefined) {history[button].reqdb=false; console.log(id + ' not found'); } else { history[button].reqdb=true; console.log(id + ' found'); } db.close(); }; }; request.onerror = HITStorage.indexedDB.onerror; }; var PAGES_TO_SCRAPE = 3; var MINIMUM_HITS = 100; var SEARCH_REFRESH=0; var URL_BASE = "/mturk/searchbar?searchWords=&selectedSearchType=hitgroups"; var initial_url = URL_BASE; var TO_REQ_URL = "http://turkopticon.ucsd.edu/reports?id="; var found_key_list=[]; var last_clear_time = new Date().getTime(); var searched_once = false; var save_new_results_time = 120; var save_results_time = 3600; var default_type = 0; var cur_loc = window.location.href; var time_input = document.createElement("INPUT"); time_input.value = 0; var page_input = document.createElement("INPUT"); page_input.value = 3; var min_input = document.createElement("INPUT"); var new_time_display_input = document.createElement("INPUT"); new_time_display_input.value = 300; var reward_input = document.createElement("INPUT"); var qual_input = document.createElement("INPUT"); qual_input.type = "checkbox"; qual_input.checked = true; var masters_input = document.createElement("INPUT"); masters_input.type = "checkbox"; var sort_input1 = document.createElement("INPUT"); sort_input1.type = "radio"; sort_input1.name = "sort_type"; sort_input1.value = "latest"; sort_input1.checked = true; var sort_input2 = document.createElement("INPUT"); sort_input2.type = "radio"; sort_input2.name = "sort_type"; sort_input2.value = "most"; var sort_input3 = document.createElement("INPUT"); sort_input3.type = "radio"; sort_input3.name = "sort_type"; sort_input3.value = "amount"; var search_input = document.createElement("INPUT"); var LINK_BASE = "https://www.mturk.com"; var BACKGROUND_COLOR = "rgb(19, 19, 19)"; var STATUSDETAIL_DELAY = 250; var MPRE_DELAY = 3000; var next_page = 1; var GREEN = '#66CC66'; // > 4 var LIGHTGREEN = '#ADFF2F'; // > 3 GREEN 
YELLOW var YELLOW = '#FFD700'; var ORANGE = '#FF9900'; // > 2 var RED = '#FF3030'; // <= 2 var BLUE = '#C0D9D9'; // no TO var GREY = 'lightGrey'; var BROWN = '#94704D'; var DARKGREY = '#9F9F9F'; $('body').css('background', BACKGROUND_COLOR); var API_PROXY_BASE = 'https://mturk-api.istrack.in/'; var API_MULTI_ATTRS_URL = API_PROXY_BASE + 'multi-attrs.php?ids='; var REVIEWS_BASE = 'http://turkopticon.ucsd.edu/'; var control_panel_HTML = '
'; $('body > :not(#control_panel)').hide(); //hide all nodes directly under the body $('body').prepend(control_panel_HTML); var control_panel = document.getElementById("control_panel"); var big_red_button = document.createElement("BUTTON"); var progress_report = document.createTextNode("Stopped"); var text_area = document.createElement("TABLE"); big_red_button.textContent = "Show Interface"; big_red_button.onclick = function(){show_interface();}; control_panel.appendChild(big_red_button); show_interface(); var global_run = false; var statusdetail_loop_finished = false; var date_header = ""; var history = {}; var wait_loop; function set_progress_report(text, force) { if (global_run == true || force == true) { progress_report.textContent = text; } } function get_progress_report() { return progress_report.textContent; } function wait_until_stopped() { if (global_run == true) { if (statusdetail_loop_finished == true) { big_red_button.textContent = "Start"; set_progress_report("Finished", false); } else { setTimeout(function(){wait_until_stopped();}, 500); } } } function display_wait_time(wait_time) { if (global_run == true) { var current_progress = get_progress_report(); if (current_progress.indexOf("Searching again in")!==-1) { set_progress_report(current_progress.replace(/Searching again in \d+ seconds/ , "Searching again in " + wait_time + " seconds"),false); } else set_progress_report(current_progress + " Searching again in " + wait_time + " seconds.", false); if (wait_time>1) setTimeout(function(){display_wait_time(wait_time-1);}, 1000); } } function dispArr(ar) { var disp = ""; for (var z = 0; z < ar.length; z++) { disp += "id " + z + " is " + ar[z] + " "; } console.log(disp); } function scrape($src) { var $requester = $src.find('a[href^="/mturk/searchbar?selectedSearchType=hitgroups&requester"]'); var $title = $src.find('a[class="capsulelink"]'); var $reward = $src.find('span[class="reward"]'); var $preview = $src.find('a[href^="/mturk/preview?"]'); var 
$qualified = $src.find('a[href^="/mturk/notqualified?"]'); var not_qualified_group_IDs=[]; $qualified.each(function(){ var groupy = $(this).attr('href'); groupy = groupy.replace("/mturk/notqualified?hitId=",""); not_qualified_group_IDs.push(groupy); }); var $mixed = $src.find('a[href^="/mturk/preview?"],a[href^="/mturk/notqualified?"]'); var listy =[]; $mixed.each(function(){ var groupy = $(this).attr('href'); groupy = groupy.replace("/mturk/notqualified?hitId=",""); groupy = groupy.replace("/mturk/preview?groupId=",""); listy.push(groupy); }); listy = listy.filter(function(elem, pos) { return listy.indexOf(elem) == pos; }); for (var j = 0; j < $requester.length; j++) { var $hits = $requester.eq(j).parent().parent().parent().parent().parent().parent().find('td[class="capsule_field_text"]'); var requester_name = $requester.eq(j).text().trim(); var requester_link = $requester.eq(j).attr('href'); var group_ID=listy[j]; var preview_link = "/mturk/preview?groupId=" + group_ID; var title = $title.eq(j).text().trim(); var reward = $reward.eq(j).text().trim(); var hits = $hits.eq(4).text().trim(); var requester_id = requester_link.replace('/mturk/searchbar?selectedSearchType=hitgroups&requesterId=',''); var accept_link; accept_link = preview_link.replace('preview','previewandaccept'); key = requester_name+title+reward+group_ID; found_key_list.push(key); if (history[key] == undefined) { history[key] = {requester:"", title:"", reward:"", hits:"", req_link:"", prev_link:"", rid:"", acc_link:"", new_result:"", qualified:"", found_this_time:"", initial_time:"", reqdb:"",titledb:""}; history[key].req_link = requester_link; history[key].prev_link = preview_link; history[key].requester = requester_name; history[key].title = title; history[key].reward = reward; history[key].hits = hits; history[key].rid = requester_id; history[key].acc_link = accept_link; HITStorage.indexedDB.checkRequester(requester_id,key); HITStorage.indexedDB.checkTitle(title,key); if (searched_once) { 
history[key].initial_time = new Date().getTime();//-1000*(save_new_results_time - SEARCH_REFRESH); history[key].new_result = 0; } else { history[key].initial_time = new Date().getTime()-1000*save_new_results_time; history[key].new_result = 1000*save_new_results_time; } if (not_qualified_group_IDs.indexOf(group_ID)!==-1) history[key].qualified = false; else history[key].qualified = true; history[key].found_this_time = true; } else { history[key].new_result = new Date().getTime() - history[key].initial_time; history[key].found_this_time = true; history[key].hits = hits; } } } function statusdetail_loop(next_URL) { if (global_run == true) { if (next_URL.length != 0) { $.get(next_URL, function(data) { var $src = $(data); var maxpagerate = $src.find('td[class="error_title"]:contains("You have exceeded the maximum allowed page request rate for this website.")'); if (maxpagerate.length == 0) { set_progress_report("Processing page " + next_page, false); scrape($src); $next_URL = $src.find('a[href^="/mturk/viewsearchbar"]:contains("Next")'); next_URL = ($next_URL.length != 0) ? $next_URL.attr("href") : ""; next_page++; if (default_type == 1) { var hmin = MINIMUM_HITS+1; for (j = 0; j < found_key_list.length; j++) { if (history[found_key_list[j]].hits < hmin) { next_URL = ""; next_page = -1; break; } } } else if (next_page > PAGES_TO_SCRAPE && correct_for_skips) { var skipped_hits = 0; var added_pages = 0; for (j = 0; j < found_key_list.length; j++) { var obj = history[found_key_list[j]]; if (! 
ignore_check(obj.requester,obj.title)) skipped_hits++; } added_pages = Math.floor(skipped_hits/10); if (skipped_hits%10 >6) added_pages++; if (next_page > PAGES_TO_SCRAPE + added_pages) { next_URL = ""; next_page = -1; } } else if (next_page > PAGES_TO_SCRAPE) { next_URL = ""; next_page = -1; } setTimeout(function(){statusdetail_loop(next_URL);}, STATUSDETAIL_DELAY); } else { console.log("MPRE"); setTimeout(function(){statusdetail_loop(next_URL);}, MPRE_DELAY); } }); } else { searched_once = true; var found_hits = found_key_list.length; var shown_hits = 0; var new_hits = 0; var url = API_MULTI_ATTRS_URL; var rids = []; var lastRow = text_area.rows.length - 1; for (i = lastRow; i>0; i--) text_area.deleteRow(i); for (j = 0; j < found_key_list.length; j++) { var obj = history[found_key_list[j]]; if (ignore_check(obj.requester,obj.title) && obj.found_this_time){ ++shown_hits; var col_heads = ["" + obj.requester + "","" + obj.title + "",obj.reward,obj.hits,"TO down","Accept"]; var row = text_area.insertRow(text_area.rows.length); url += obj.rid + ','; rids.push(obj.rid); if (check_hitDB) { col_heads.push("R"); col_heads.push("T"); } if (!obj.qualified) { col_heads.push("Not Qualified"); } for (i=0; i