This code is available for download via the button below.
This code defines a Scrapy crawler to extract all links from a specified URL. It uses CrawlerProcess to start the crawler and LinkExtractor to extract links.
Technology Stack : Scrapy, CrawlerProcess, Rule, LinkExtractor
Code Type : Scrapy crawler
Code Difficulty : Intermediate
def extract_links_from_page(url):
    """Crawl *url* with Scrapy and print every hyperlink found on each page.

    Starts a blocking crawl (``CrawlerProcess.start`` runs the Twisted
    reactor until the spider finishes). For each page reached by following
    links, the ``href`` attributes of all ``<a>`` tags are printed.

    Args:
        url: The starting URL for the crawl.

    Returns:
        None. Output is emitted via ``print`` as pages are crawled.
    """
    # Imports are kept local so merely importing this module does not
    # require scrapy to be installed.
    from scrapy.crawler import CrawlerProcess
    from scrapy.linkextractors import LinkExtractor
    # NOTE: ``rules`` are only honoured by CrawlSpider subclasses — a plain
    # scrapy.Spider silently ignores them, which was the original bug.
    from scrapy.spiders import CrawlSpider, Rule

    class LinksExtractorSpider(CrawlSpider):
        """CrawlSpider that follows every link and reports hrefs per page."""
        name = 'links_extractor'
        start_urls = [url]  # closes over the function argument
        rules = (
            # Follow every extracted link and route each response to
            # ``parse_item`` (the callback the Rule names must exist).
            Rule(LinkExtractor(), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            # Print all href attributes found on the current page.
            print("Found links:", response.css('a::attr(href)').getall())

    process = CrawlerProcess(settings={
        'USER_AGENT': 'Scrapybot (+http://www.yourdomain.com)'
    })
    process.crawl(LinksExtractorSpider)
    # CrawlerProcess.start() takes no ``callback`` keyword; it simply
    # blocks until the crawl completes.
    process.start()