Extract Email Addresses from HTML

  • Share this:

Code introduction


Extracts email addresses found in the text of a chosen tag (`<a>` by default) within the given HTML content.


Technology Stack: BeautifulSoup (bs4), regular expressions (re)

Code Type: Function

Code Difficulty: Intermediate


                
                    
def extract_emails_from_html(html_content: str, tag: str = 'a') -> list:
    """
    Extract email addresses from the text inside a given tag of HTML content.

    Only elements matching ``tag`` (and whatever is nested inside them) are
    parsed; every text node found there is scanned with a regular expression.
    Attribute values such as ``href="mailto:..."`` are not inspected.

    Args:
    html_content (str): The HTML content from which to extract emails.
    tag (str): The tag to search for within the HTML content. Default is 'a'.

    Returns:
    list: The matched email addresses, in the order they were found.
        Duplicates are kept.
    """
    from bs4 import BeautifulSoup, SoupStrainer
    import re

    # The TLD class must be [A-Za-z]: the earlier [A-Z|a-z] also accepted a
    # literal '|', so "a@b.com|c@d.org" matched as the single bogus address
    # "a@b.com|c". Compiled once so the loop below reuses the same pattern.
    email_pattern = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )

    # Restrict parsing to the requested tag so unrelated markup is skipped.
    strainer = SoupStrainer(tag)
    soup = BeautifulSoup(html_content, 'html.parser', parse_only=strainer)

    # string=True selects every text node; it is the current spelling of the
    # deprecated text=True keyword.
    emails = []
    for text_node in soup.find_all(string=True):
        emails.extend(email_pattern.findall(str(text_node)))

    return emails