HCrawler 3.0 - a crawler written in Python 3
Published by henrique (last updated on 31/05/2024)
Homepage: https://mrh-ofici.github.io/
A crawler written in Python 3 with several extras: besides following links, it can extract e-mails, phone numbers, CPFs and CNPJs from the pages it visits and save everything to an output file.
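The script relies on three third-party modules (requests, BeautifulSoup from bs4 and colorama). Assuming pip is available for your Python 3 interpreter, they can be installed with something like:

pip3 install requests beautifulsoup4 colorama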
#!/usr/bin/env python3
# HCrawler 3.0 - crawls a website and extracts links, e-mails, phone numbers, CPFs and CNPJs.
import argparse
import re
import sys

import requests
from bs4 import BeautifulSoup
from colorama import Fore

print('-' * 84)
# Banner
print(Fore.LIGHTRED_EX + 'H C R A W L E R' + Fore.LIGHTGREEN_EX)

parser = argparse.ArgumentParser(
    description='Detects links and e-mails on websites, -h for help',
    usage='./hcrawler.py -d domain -l False -o output.txt -e True -c False '
          '-u "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/114.0.0.0 Safari/537.36"')
parser.add_argument('-d', '--domain', action='store', dest='hosts', required=True,
                    help='Target domain (start URL)')
parser.add_argument('-u', '--user-agent', action='store', dest='agent',
                    default='Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0',
                    help='User-Agent header')
parser.add_argument('--cookies', action='store', dest='cookie', help='Cookies to send with every request')
parser.add_argument('--data', action='store', dest='data', help='Request body, e.g. "password=123"')
parser.add_argument('-o', '--output', action='store', dest='output', help='File to save the output to')
parser.add_argument('-l', '--links', action='store', dest='links', default='True', help='Link mode')
parser.add_argument('-p', '--phone', action='store', dest='phones', default='False', help='Phone mode')
parser.add_argument('-cn', '--cnpj', action='store', dest='cnpjs', default='False', help='CNPJ mode')
parser.add_argument('-cp', '--cpf', action='store', dest='cpfs', default='False', help='CPF mode')
parser.add_argument('-e', '--email', action='store', dest='emails', default='False', help='E-mail mode')
parser.add_argument('-c', action='store', dest='cont', default='True',
                    help='Follow the links found on the site (set to False to crawl only the given page)')
arguments = parser.parse_args()

PHONES = []
EMAILS = []
TO_CRAWL = []
CPFS = []
CNPJS = []
CRAWLED = set()
file = None

if arguments.output:
    print('[*] Mode output activated\n')
    file = open(arguments.output, 'w')


def get_cnpj(html):
    return re.findall(r'\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}', html)


def get_cpf(html):
    return re.findall(r'\d{3}\.\d{3}\.\d{3}-\d{2}', html)


def get_links(html):
    # Extract absolute links from the <a href> tags of the page.
    links = []
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all('a', href=True):
        link = tag['href']
        if link.startswith('http'):
            links.append(link)
    return links


def get_emails(html):
    return re.findall(r'\w[\w\-.]+?@\w[\w\-.]+', html)


def filter_links(text):
    return re.findall(r'https?://[\w.\-/?=]+', text)


def get_phone(html):
    return re.findall(r'\+\d+[ ]?\(?\d+\)?[ ]?\d+[-. ]?\d+', html)


def finalize_output():
    # Re-read everything written so far and rewrite the output file grouped by category.
    file.close()
    with open(arguments.output, 'r') as arq:
        a = arq.read()
    sections = []
    if arguments.phones == 'True':
        sections.append(('Phones', get_phone(a)))
    if arguments.cpfs == 'True':
        sections.append(('Cpfs', get_cpf(a)))
    if arguments.cnpjs == 'True':
        sections.append(('Cnpjs', get_cnpj(a)))
    if arguments.links == 'True':
        sections.append(('Links', filter_links(a)))
    if arguments.emails == 'True':
        sections.append(('Emails', get_emails(a)))
    with open(arguments.output, 'w') as end:
        for title, items in sections:
            end.write(f'\n{title}:\n\n')
            for item in items:
                end.write(f'{item}\n')


def request(url):
    header = {'User-Agent': arguments.agent}
    if arguments.cookie:
        header['Cookie'] = arguments.cookie
    try:
        if arguments.data:
            response = requests.get(url, headers=header, data=arguments.data)
        else:
            response = requests.get(url, headers=header)
        return response.text
    except KeyboardInterrupt:
        # Ctrl+C: close and regroup the output file before leaving.
        print('-' * 84)
        if arguments.output:
            finalize_output()
        sys.exit(0)
    except requests.RequestException:
        return None


def extract(url, html):
    # Collect e-mails, phones, CPFs and CNPJs from a page and optionally write them to the output file.
    if arguments.emails == 'True':
        for email in get_emails(html):
            if email not in EMAILS:
                print(f'[*] {email}')
                EMAILS.append(email)
                if arguments.output:
                    file.write(f'{email}\n')
    if arguments.phones == 'True':
        for phone in get_phone(html):
            if phone not in PHONES:
                print(f'[*] {phone}\n')
                PHONES.append(phone)
                if arguments.output:
                    file.write(f'{phone}\n')
    if arguments.cpfs == 'True':
        for cpf in get_cpf(html):
            if cpf not in CPFS:
                print(f'[*] {cpf}\n')
                CPFS.append(cpf)
                if arguments.output:
                    file.write(f'{cpf}\n')
    if arguments.cnpjs == 'True':
        for cnpj in get_cnpj(html):
            if cnpj not in CNPJS:
                print(f'[*] {cnpj}\n')
                CNPJS.append(cnpj)
                if arguments.output:
                    file.write(f'{cnpj}\n')
    CRAWLED.add(url)
    if arguments.output:
        file.write(f'{url}\n')


def print_modes():
    if arguments.links == 'True':
        print('[*] Mode link activated\n')
    if arguments.emails == 'True':
        print('[*] Mode email activated\n')
    if arguments.phones == 'True':
        print('[*] Mode phone activated\n')
    if arguments.cpfs == 'True':
        print('[*] Mode cpf activated\n')
    if arguments.cnpjs == 'True':
        print('[*] Mode cnpj activated\n')


def crawl():
    # Recursive mode: keep following every new link found until the queue is empty.
    print_modes()
    while TO_CRAWL:
        url = TO_CRAWL.pop()
        html = request(url)
        if html:
            if arguments.links == 'True':
                for link in get_links(html):
                    if link not in CRAWLED and link not in TO_CRAWL:
                        TO_CRAWL.append(link)
            print(f'[*] Crawling {url}\n')
        extract(url, html or '')
    print('Done!!!\n')


def crawlb():
    # Single-page mode: crawl only the given URL and list the links found on it.
    print_modes()
    if TO_CRAWL:
        url = TO_CRAWL.pop()
        html = request(url)
        if html and arguments.links == 'True':
            for link in get_links(html):
                if link not in CRAWLED and link not in TO_CRAWL:
                    TO_CRAWL.append(link)
                    print(f'[*] {link}\n')
                    if arguments.output:
                        file.write(f'{link}\n')
        extract(url, html or '')
        if arguments.output:
            print('[*] Writing output...\n')
    else:
        print('Done!!!')


if __name__ == '__main__':
    TO_CRAWL.append(arguments.hosts)
    if arguments.cont == 'True':
        crawl()
    else:
        crawlb()
    print('[*] Done!!!\n')
    print(Fore.RESET + '-' * 84)
    if arguments.output:
        finalize_output()