from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class LocalSearchEngine:
    """Crawl one site breadth-first and answer substring queries over it.

    Pages are fetched with ``requests``, reduced to their text with
    BeautifulSoup, and stored in ``self.index`` keyed by URL. Only links whose
    network location equals that of ``start_url`` are followed.

    NOTE: the crawler does not consult robots.txt and does not throttle its
    requests.
    """

    def __init__(self, start_url, max_pages=50):
        """Store the crawl configuration; no network access happens here.

        Args:
            start_url: URL the crawl begins at; its netloc bounds the crawl.
            max_pages: Maximum number of pages a single ``crawl()`` call
                will index.
        """
        self.start_url = start_url
        self.max_pages = max_pages
        self.index = {}  # url -> text

    @staticmethod
    def _resolve_link(base_url, href, site_netloc):
        """Return the absolute, fragment-free URL for *href*, or None to skip.

        A link is skipped when it is malformed, is not http(s), or points at
        a network location other than *site_netloc*.
        """
        try:
            absolute, _fragment = urldefrag(urljoin(base_url, href))
            parts = urlparse(absolute)
        except ValueError:
            # urllib raises ValueError for malformed URLs (e.g. a broken IPv6
            # literal); one bad href must not discard the rest of the page.
            return None
        if parts.scheme not in ("http", "https"):
            return None
        if parts.netloc != site_netloc:
            return None
        return absolute

    def crawl(self):
        """Fetch up to ``max_pages`` same-site pages and index their text.

        Starting from ``start_url``, pages are visited in breadth-first
        order. Each successfully fetched page is stored in ``self.index``;
        pages that fail to download or parse are reported and skipped.
        Progress is printed to stdout.
        """
        site_netloc = urlparse(self.start_url).netloc
        # Every URL ever queued, so each one is fetched at most once, even
        # when it fails or is linked from many pages.
        queued = {self.start_url}
        to_visit = deque([self.start_url])
        fetched = 0
        print(f"[Crawler] Starting crawl at: {self.start_url}")
        while to_visit and fetched < self.max_pages:
            url = to_visit.popleft()
            print(f"[Crawler] Fetching: {url}")
            try:
                r = requests.get(url, timeout=5)
                # Do not index error pages (4xx/5xx) as if they were content.
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                text = soup.get_text(" ", strip=True)
            except Exception as e:
                # Deliberately broad: the crawl is best-effort, so any
                # per-page failure is reported and the crawl moves on.
                print(f"[Crawler] Error fetching {url}: {e}")
                continue
            self.index[url] = text
            fetched += 1
            # discover new links
            for link in soup.find_all("a", href=True):
                new = self._resolve_link(url, link["href"], site_netloc)
                if new is not None and new not in queued:
                    queued.add(new)
                    to_visit.append(new)
        print(f"[Crawler] Done. Indexed {len(self.index)} pages.\n")

    def search(self, query):
        """Return the URLs of indexed pages whose text contains *query*.

        Matching is a case-insensitive substring test; results follow the
        insertion order of ``self.index``.
        """
        q = query.lower()
        return [url for url, text in self.index.items() if q in text.lower()]
def main():
    """Crawl the configured site, then answer search queries interactively.

    Queries are read from stdin until end-of-input or Ctrl-C. Blank queries
    are ignored; for every other query the matching URLs are printed one per
    line.
    """
    start_url = "https://incels.is/"  # change this to your domain
    engine = LocalSearchEngine(start_url, max_pages=30)
    engine.crawl()
    while True:
        try:
            q = input("Search query: ").strip()
        except (EOFError, KeyboardInterrupt):
            # The loop has no other exit path; leave quietly rather than
            # ending the session with a traceback.
            print()
            break
        if not q:
            continue
        hits = engine.search(q)
        print("\nResults:")
        if not hits:
            print(" No matches found.")
        else:
            for h in hits:
                print(" -", h)
        print()
# Script entry point: start the crawl and the interactive prompt only when
# this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()