Extract and Write URLs from Text to JSON

  • Share this:

Code introduction


This script reads the text content of the file at the specified path, uses a regular expression to extract every http/https URL from that text, and finally writes the resulting list of URLs to the specified JSON file.


Technology Stack : csv, re, time, random, os, json

Code Type : Function

Code Difficulty : Intermediate


                
                    
import csv
import re
import time
import random
import os
import json

def extract_text_from_file(file_path):
    """
    Read the entire contents of a UTF-8 encoded text file.

    :param file_path: path of the file to read
    :return: the file contents as a single string, or an empty string
        when the file does not exist
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        # Return an empty *string* rather than a list so the result always
        # has the same type and can be handed directly to regex functions
        # (re.findall raises TypeError when given a list).
        return ''

def extract_urls(text):
    """
    Extract every http/https URL found in a piece of text.

    :param text: text to scan; empty or missing input (``''``, ``None``,
        ``[]``) yields an empty list instead of raising
    :return: list of matched URL strings, in order of appearance
    """
    # Empty placeholders (for example the result of reading a file that
    # does not exist) contain no URLs; returning early avoids the TypeError
    # that findall raises for non-string input such as [] or None.
    if not text:
        return []
    # NOTE: inside the character class, ``$-_`` is a range (ASCII 0x24-0x5F),
    # which is what allows characters such as ``/``, ``:``, ``?`` and ``=``
    # to match. The pattern is intentionally left unchanged.
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.findall(text)

def write_urls_to_json(urls, output_file):
    """
    Serialize a list of URLs to a JSON file.

    The file is opened for writing as UTF-8 (existing content is replaced)
    and the data is dumped with a 4-space indent, leaving non-ASCII
    characters unescaped.

    :param urls: list of URL strings to write
    :param output_file: path of the JSON file to create or overwrite
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(urls, sink, **dump_options)

def main(input_file, output_file):
    """
    Run the full pipeline: read the input file, extract the URLs from its
    text, and write them to a JSON file.

    :param input_file: path of the text file to scan
    :param output_file: path of the JSON file that receives the URL list
    """
    # Evaluated inside-out: read text -> find URLs -> write JSON.
    write_urls_to_json(
        extract_urls(extract_text_from_file(input_file)),
        output_file,
    )

# Example usage: scan input.txt and write the URLs found to output.json.
if __name__ == "__main__":
    main(input_file='input.txt', output_file='output.json')