I’m obviously missing something here. Same project I’ve been working on for a number

Question

0

Asked: May 17, 20262026-05-17T22:44:44+00:00 2026-05-17T22:44:44+00:00

I’m obviously missing something here. Same project I’ve been working on for a number

0

I’m obviously missing something here. Same project I’ve been working on for a number of days. Stepping through it bit by bit, seemed to be working fine. I added in a portion of the main() function to actually create the comparison lists, and suddenly starts throwing out cannot pop from empty list error at me, even through a print function I’ve placed ahead of the pop() call clearly shows that the list is not empty? Any ideas what I’m doing wrong? and is this monstrosity gonna actually work the way I intend? First time working with threads and all. Here is the code in its entirety:

import urllib
import urllib2
import sys
from lxml.html import parse, tostring, fromstring
from urlparse import urlparse
import threading



class Crawler(threading.Thread):

 def __init__(self):
    self.links = []
    self.queue = []
    self.mal_list = []
    self.count = 0
    self.mal_set = set(self.mal_list)
    self.crawled = []
    self.crawled_set = set(self.crawled)
    self.links_set = set(self.links)
    self.queue.append(sys.argv[1])
    self.queue_set = set(self.queue)



def run(self, max_depth):
    print(self.queue)
    while self.count < max_depth:
        tgt = self.queue.pop(0)
        if tgt not in self.mal_set:
            self.crawl(tgt)
        else:
            print("Malicious Link Found: {0}".format(tgt)
            continue
    sys.exit("Finished!")


def crawl(self, tgt):
    url = urlparse(tgt)
    self.crawled.append(tgt)
    try:
        print("Crawling {0}".format(tgt))
        request = urllib2.Request(tgt)
        request.add_header("User-Agent", "Mozilla/5,0")
        opener = urllib2.build_opener()
        data = opener.open(request)
        self.count += 1

    except:
        return


    doc = parse(data).getroot()
    for tag in doc.xpath("//a[@href]"):
            old = tag.get('href')
            fixed = urllib.unquote(old)
            self.links.append(fixed)
            self.queue_links(self.links_set, url)


def queue_links(self, links, url):
        for link in links:
            if link.startswith('/'):
                link = "http://" + url.netloc + "/" + link

            elif link.startswith('#'):
                continue

            elif link.startswith('http'):

                link = 'http://' + url.netloc + '/' + link

            if link.decode('utf-8') not in self.crawled_set:
                self.queue.append(link)




def make_mal_list(self):
    """
    Open various malware and phishing related blacklists and create a list 
    of URLS from which to compare to the crawled links
    """
    hosts1 = "hosts.txt"
    hosts2 = "MH-sitelist.txt"
    hosts3 = "urls.txt"

    with open(hosts1) as first:
        for line1 in first.readlines():
            link = "http://" + line1.strip()
            self.mal_list.append(link)

    with open(hosts2) as second:
        for line2 in second.readlines():
            link = "http://" + line2.strip()
            self.mal_list.append(link)

    with open(hosts3) as third:
        for line3 in third.readlines():
            link = "http://" + line3.strip()
            self.mal_list.append(link)
def main():
    crawler = Crawler()
    crawler.make_mal_list()
    crawler.run(25)
if __name__ == "__main__":
  main()

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-05-17T22:44:44+00:00

First of all , i did get lost while reading your code so maybe i can give you some remark if i may before:

to many instance variable you don’t have to create a new instance var just to put on it a set() of another vars like this code : self.mal_set = set(self.mal_list)and you are repeating the same thing many times
if you want to use threading so use it, because in your code you are just creating one thread, for that you should create like (10) thread or so each thread will deal with a bunch of URL that he should fetch, and don’t forget to put the threads in a Queue.Queue to synchronize between them.
EDIT : Ahh i forgot : indent your code 🙂

now about your problem :

where do you assign self.queue because i don’t see it ? you are just calling the make_mal_list() method that will initialize only self.mal_listand after when you run you own thread i think it’s obvious that self.queue is empty so you can’t pop() right ?

EDIT 2:

i think your example is more complicate (using black list and all this stuff, …) but you can start with something like this:

import threading
import Queue
import sys
import urllib2
import url
from urlparse import urlparse

THREAD_NUMBER = 10


class Crawler(threading.Thread):

    def __init__(self, queue, mal_urls):
        self.queue = queue
        self.mal_list = mal_urls
        threading.Thread.__init__(self) # i forgot , thanks seriyPS :)

    def run(self):

        while True:
             # Grabs url to fetch from queue.
             url = self.queue.get()
             if url not in self.mal_list:
                 self.crawl(url)
             else:
                 print "Malicious Link Found: {0}".format(url)
             # Signals to queue job is done
             self.queue.task_done()

     def crawl(self, tgt):
         try:
             url = urlparse(tgt)
             print("Crawling {0}".format(tgt))
             request = urllib2.Request(tgt)
             request.add_header("User-Agent", "Mozilla/5,0")
             opener = urllib2.build_opener()
             data = opener.open(request)
         except: # TODO: write explicit exceptions the URLError, ValueERROR ...
             return

         doc = parse(data).getroot()
         for tag in doc.xpath("//a[@href]"):
             old = tag.get('href')
             fixed = urllib.unquote(old)

             # I don't think you need this, but maybe i'm mistaken.
             # self.links.append(fixed) 

             # Add more URL to the queue.
             self.queue_links(fixed, url)


    def queue_links(self, link, url):
        """I guess this method allow recursive download of urls that will
        be fetched from the web pages ????
        """

        #for link in links:  # i changed the argument so now links it just one url.
        if link.startswith('/'):
            link = "http://" + url.netloc + "/" + link

        elif link.startswith('#'):
            continue

        elif link.startswith('http'):
            link = 'http://' + url.netloc + '/' + link

        # Add urls extracted from the HTML text to the queue to fetche them
        if link.decode('utf-8') not in self.crawled_set:
            self.queue.put(link)


def get_make_mal_list():
    """Open various malware and phishing related blacklists and create a list 
    of URLS from which to compare to the crawled links
    """

    hosts1 = "hosts.txt"
    hosts2 = "MH-sitelist.txt"
    hosts3 = "urls.txt"

    mal_list = []

    with open(hosts1) as first:
        for line1 in first:
            link = "http://" + line1.strip()
            mal_list.append(link)

    with open(hosts2) as second:
        for line2 in second:
            link = "http://" + line2.strip()
            mal_list.append(link)

    with open(hosts3) as third:
        for line3 in third:
            link = "http://" + line3.strip()
            mal_list.append(link)

    return mal_list

def main():

    queue = Queue.Queue()

    # Get malicious URLs.
    mal_urls = set(get_make_mal_list())

    # Create a THREAD_NUMBER thread and start them.
    for i in xrange(THREAD_NUMBER):
        cr = Crawler(queue, mal_urls)
        cr.start()

    # Get all url that you want to fetch and put them in the queue.
    for url in sys.argv[1:]:
        queue.put(url)

    # Wait on the queue until everything has been processed.
    queue.join()


if __name__ == '__main__':
    main()

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I’m obviously missing something here. Same project I’ve been working on for a number

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply