I'm trying to extract the input fields from each page of a website, together with the URL of the page that contains them, and store both in a database.
The code runs without errors, but it does not produce the output I want.
Spider code:
class MySpider(CrawlSpider):
    """Crawl testaspnet.vulnweb.com and record form inputs found on its pages.

    Each response handed to ``parse_item`` yields one ``IsaItem`` holding the
    page URL plus the ``id`` of the first text, password and file ``<input>``
    on that page (``None`` when the page has no such input).
    """

    name = 'isa_spider'
    allowed_domains = ['testaspnet.vulnweb.com']
    start_urls = ['http://testaspnet.vulnweb.com']
    rules = (
        Rule(SgmlLinkExtractor(allow='/*'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build an ``IsaItem`` describing the inputs on ``response``.

        Returns:
            The populated item: ``response_fld`` is the page URL and each
            ``*_input`` key is the first matching ``id`` value or ``None``.
        """
        selector = HtmlXPathSelector(response)
        item = IsaItem()
        item['response_fld'] = response.url

        # (item key, XPath) pairs, evaluated in this order.
        # NOTE(review): every query reads only ``@id``, so an <input> that
        # matches through ``@name`` alone contributes no value.
        queries = (
            ('text_input',
             "//input[(@id or @name) and (@type = 'text' )]/@id "),
            ('pass_input',
             "//input[(@id or @name) and (@type = 'password')]/@id"),
            ('file_input',
             "//input[(@id or @name) and (@type = 'file')]/@id"),
        )
        for key, xpath in queries:
            matches = selector.select(xpath).extract()
            if matches:
                # Only the first match on the page is kept.
                item[key] = matches[0]
            else:
                # No field of this type on the page.
                item[key] = None
        return item
Pipeline code:
class SQLiteStorePipeline(object):
    """Scrapy item pipeline that writes scraped input names and page URLs to SQLite.

    Input names go to the ``inputs`` table (column ``input_name``) and the
    page URL goes to the ``links`` table (column ``link``) of ``./project.db``.
    Both tables must already exist; this class does not create them.
    """

    # Item keys that hold an input name, in the order they are inserted.
    _INPUT_FIELDS = ('text_input', 'pass_input', 'file_input')

    def __init__(self):
        """Open the database file and keep one cursor for all inserts."""
        self.conn = sqlite3.connect('./project.db')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Persist one item and pass it on unchanged.

        Args:
            item: Mapping with the keys ``text_input``, ``pass_input``,
                ``file_input`` and ``response_fld``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.

        Raises:
            KeyError: If one of the expected keys is missing from ``item``.
        """
        for field in self._INPUT_FIELDS:
            input_name = item[field]
            if input_name is None:
                # None means the page had no input of this type; storing it
                # would only add an empty row to ``inputs``.
                continue
            self.cur.execute(
                "insert into inputs ( input_name) values(?)", (input_name,)
            )
        self.cur.execute(
            "insert into links (link) values(?)", (item['response_fld'],)
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Release the database connection once the spider has finished."""
        self.conn.close()
database schema picture
required output picture
(Sorry for not inserting the images directly; my reputation is below 10.)
Haven’t tested this: