# You can download this code by clicking the button below.
# This code is now available for download.
# Extracts email addresses from the given HTML content.
# Technology Stack: BeautifulSoup, regular expressions
# Code Type: Function
# Code Difficulty: Intermediate
def extract_emails_from_html(html_content, tag='a'):
    """Extract email addresses from the text inside a given HTML tag.

    Only elements matching ``tag`` are parsed; everything else in the
    document is discarded by the parser. The text nodes found inside those
    elements are then scanned with a regular expression. Attribute values
    (for example ``href="mailto:..."``) are not text nodes and are therefore
    not searched.

    Args:
        html_content (str): The HTML content from which to extract emails.
            ``None`` or an empty value yields an empty list.
        tag (str): The tag to search for within the HTML content.
            Default is 'a'.

    Returns:
        list: The email addresses found, in document order. Duplicates are
        kept.
    """
    # Nothing to parse: bail out before importing or building a parser.
    if not html_content:
        return []

    import re

    from bs4 import BeautifulSoup, SoupStrainer

    # The TLD class is [A-Za-z]; a '|' inside a character class is a literal
    # pipe, not alternation, and would let matches run across "a@b.com|c@d.org".
    email_pattern = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )

    # Restrict parsing to the requested tag so unrelated markup is skipped.
    strainer = SoupStrainer(tag)
    soup = BeautifulSoup(html_content, 'html.parser', parse_only=strainer)

    emails = []
    # string=True selects every text node; it replaces the deprecated
    # text=True spelling of the same filter.
    for text_node in soup.find_all(string=True):
        emails.extend(email_pattern.findall(str(text_node)))
    return emails