Scrapy Crawler for Extracting Links from a URL

  • Share this:

Code introduction


This code defines a Scrapy crawler to extract all links from a specified URL. It uses CrawlerProcess to start the crawler and LinkExtractor to extract links.


Technology Stack : Scrapy, CrawlerProcess, Rule, LinkExtractor

Code Type : Scrapy crawler

Code Difficulty : Intermediate


                
                    
def extract_links_from_page(url):
    """Crawl ``url`` with Scrapy and print every hyperlink found.

    Starts a blocking :class:`~scrapy.crawler.CrawlerProcess`, so this
    function does not return until the crawl finishes. Scrapy's reactor
    can only be started once per process, so call this at most once.

    Args:
        url: The page to start crawling from.

    Side effects:
        Prints the list of ``href`` values of each crawled page to stdout.
    """
    import scrapy
    from scrapy.crawler import CrawlerProcess
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor

    # NOTE: must subclass CrawlSpider, not scrapy.Spider — the ``rules``
    # attribute is only processed by CrawlSpider.
    class ScrapySpider(CrawlSpider):
        name = 'links_extractor'
        start_urls = [url]
        rules = (
            # follow=True continues crawling pages reached through the
            # extracted links; each fetched page is handled by parse_item.
            Rule(LinkExtractor(), callback='parse_item', follow=True),
        )

        def parse_item(self, response):
            # The callback named in the Rule above; it must exist on the
            # spider (the original code referenced it but never defined it).
            print("Found links:", response.css('a::attr(href)').getall())

    process = CrawlerProcess(settings={
        'USER_AGENT': 'Scrapybot (+http://www.yourdomain.com)'
    })
    # start_urls is already set on the class; no need to pass it again.
    process.crawl(ScrapySpider)
    # CrawlerProcess.start() accepts no ``callback`` argument — per-response
    # handling belongs in the spider's callback methods (parse_item above).
    process.start()