3.1 任务调度器的实现
import queue
import threading


class TaskScheduler:
    """Distribute URLs to a fixed number of worker threads via a FIFO queue.

    Each worker blocks on the queue and processes URLs until it receives
    ``None``, which acts as a shutdown sentinel for exactly one worker.
    """

    def __init__(self, max_threads):
        """Create an idle scheduler.

        Args:
            max_threads: Number of worker threads launched by ``start``.
        """
        self.max_threads = max_threads
        self.task_queue = queue.Queue()
        # Workers that have been launched and have not exited yet; every
        # worker removes itself from this list when its loop ends.
        self.thread_pool = []

    def add_task(self, url):
        """Queue ``url`` for crawling.

        Passing ``None`` queues a sentinel that makes one worker exit.
        """
        self.task_queue.put(url)

    def start(self):
        """Launch ``max_threads`` worker threads."""
        for _ in range(self.max_threads):
            thread = threading.Thread(target=self.worker)
            # Register the thread before starting it: a worker that finds a
            # sentinel already queued exits immediately and removes itself
            # from the pool, which raises ValueError if it is not there yet.
            self.thread_pool.append(thread)
            thread.start()

    def stop(self):
        """Ask every live worker to exit and wait until all have finished.

        One sentinel is queued per worker currently in the pool, so tasks
        queued earlier are still processed first (the queue is FIFO).
        """
        # Snapshot the pool: workers mutate it as they exit.
        workers = list(self.thread_pool)
        for _ in workers:
            self.task_queue.put(None)
        for thread in workers:
            thread.join()

    def worker(self):
        """Process queued URLs until a ``None`` sentinel is received."""
        while True:
            url = self.task_queue.get()
            try:
                if url is None:
                    # Sentinel value: this worker should shut down.
                    break
                # Placeholder for the real crawl (send the request, parse
                # the response).
                print(f"Crawling {url}")
            finally:
                # Every get() needs a matching task_done(), including the
                # sentinel; otherwise task_queue.join() blocks forever.
                self.task_queue.task_done()
        # The worker is finished, so drop it from the live-worker list.
        self.thread_pool.remove(threading.current_thread())
3.2 爬虫引擎的实现
import requests from bs4 import BeautifulSoup import json import sqlite3 from sqlite3 import Error class CrawlerEngine: def __init__(self, db_path): self.db_path = db_path self._create_connection() # Create a database connection (if it doesn't exist) and return a cursor object. self._create_table() # Create a table to store the crawled data if it doesn't exist. def _create_connection(self): try: conn = sqlite3.connect(self.db_path) # Connect to the SQLite database (if it exists). Otherwise, create a new one with the specified name and return a cursor object for it. return conn # Return the cursor object for further use in other methods of this class (e.g., inserting data into tables). except Error as e: # If an error occurs while trying to connect to or create the database, print an error message and raise an exception with the error details for further handling in the calling code (e.g., by a try-except block). print(e) # Print the error message for debugging purposes (optional). raise e # Raise the exception with the error details for further handling in the calling code (e.g., by a try-except block). def _create_table(self): # Define a method to create a table if it doesn't exist (optional). This method can be used to create tables for different types of data depending on your needs (e.g., one table for URLs, another for extracted data). try: # Use a try-except block to handle any errors that may occur during table creation (e.g., if the table already exists). sql_create_urls_table = """CREATE TABLE IF NOT EXISTS urls (id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT);""" # Define an SQL statement to create a table named "urls" with two columns: "id" (an auto-incrementing primary key) and "url" (a column to store URLs). cur = self._conn.cursor() # Get a cursor object from the connection created in the __init__ method (or from a new connection if needed). 
cur.execute(sql_create_urls_table) # Execute the SQL statement to create the table if it doesn't exist already (or do nothing if it does exist). cur.close() # Close the cursor object after executing the SQL statement (optional but recommended for good practice). except Error as e: # If an error occurs during table creation (e.g., if there's already a table with the same name), print an error message and raise an exception with the error details for further handling in the calling code (e.g., by a try-except block). print(e) # Print the error message for debugging purposes (optional). raise e # Raise the exception with the error details for further handling in the calling code (e.g., by a try-except block). def fetch_url(self, url): # Define a method to fetch a URL and return its content as a string (optional but useful for testing purposes). This method can be used to fetch URLs from a list or other sources and store them in the database or perform other operations as needed before crawling them later on using another method (e.g., crawl_url). try: # Use a try-except block to handle any errors that may occur during URL fetching (e.g., network issues or invalid URLs). response = requests.get(url) # Use the requests library to fetch the URL content as a response object (assuming you've installed requests using pip install requests). return response.text # Return the content of the response object as a string (or raise an exception if there's an error during fetching). Note that this method doesn't handle exceptions directly; instead, it relies on requests' built-in error handling mechanisms which raise exceptions when something goes wrong during fetching (e.g., if there's no internet connection or if there's an invalid URL). 