শুভ অপরাহ্ন. এটি লেখার পর 2 বছর হয়ে গেছে।
āĻ¯āĻāĻ¨ āĻāĻŽāĻŋ āĻšāĻžāĻŦāĻ°ā§āĻ° āĻāĻāĻāĻŋ āĻ āĻ¨ā§āĻ˛āĻŋāĻĒāĻŋ āĻĒā§āĻ¤ā§ āĻā§āĻ¯āĻŧā§āĻāĻŋāĻ˛āĻžāĻŽ, āĻ¤āĻāĻ¨ āĻāĻŽāĻŋ āĻāĻāĻāĻŋ āĻĒāĻžāĻ°ā§āĻ¸āĻžāĻ° āĻ˛ā§āĻāĻžāĻ° āĻ¸āĻŋāĻĻā§āĻ§āĻžāĻ¨ā§āĻ¤ āĻ¨āĻŋāĻ¯āĻŧā§āĻāĻŋāĻ˛āĻžāĻŽ āĻ¯āĻž āĻ˛ā§āĻāĻāĻĻā§āĻ° āĻ¸āĻŽāĻ¸ā§āĻ¤ āĻŦāĻŋāĻˇāĻ¯āĻŧāĻŦāĻ¸ā§āĻ¤ā§ āĻĄāĻžāĻāĻžāĻŦā§āĻ¸ā§ āĻ¸āĻāĻ°āĻā§āĻˇāĻŖ āĻāĻ°āĻŦā§āĨ¤ āĻāĻāĻŋ āĻā§āĻāĻžāĻŦā§ āĻāĻā§āĻā§ āĻāĻŦāĻ āĻāĻŽāĻŋ āĻā§ āĻ¤ā§āĻ°ā§āĻāĻŋāĻ° āĻ¸āĻŽā§āĻŽā§āĻā§āĻ¨ āĻšāĻ¯āĻŧā§āĻāĻŋ - āĻāĻĒāĻ¨āĻŋ āĻāĻžāĻāĻžāĻ° āĻ¨ā§āĻā§ āĻĒāĻĄāĻŧāĻ¤ā§ āĻĒāĻžāĻ°ā§āĻ¨āĨ¤
TL;DR -
পার্সার প্রথম সংস্করণ. এক থ্রেড, অনেক সমস্যা
āĻļā§āĻ°ā§ āĻāĻ°āĻžāĻ° āĻāĻ¨ā§āĻ¯, āĻāĻŽāĻŋ āĻāĻāĻāĻŋ āĻ¸ā§āĻā§āĻ°āĻŋāĻĒā§āĻ āĻĒā§āĻ°ā§āĻā§āĻāĻžāĻāĻĒ āĻ¤ā§āĻ°āĻŋ āĻāĻ°āĻžāĻ° āĻ¸āĻŋāĻĻā§āĻ§āĻžāĻ¨ā§āĻ¤ āĻ¨āĻŋāĻ¯āĻŧā§āĻāĻŋ āĻ¯ā§āĻāĻžāĻ¨ā§ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§āĻāĻŋ āĻĒāĻžāĻ°ā§āĻ¸ āĻāĻ°āĻž āĻšāĻŦā§ āĻāĻŦāĻ āĻĄāĻžāĻāĻ¨āĻ˛ā§āĻĄ āĻāĻ°āĻžāĻ° āĻ¸āĻžāĻĨā§ āĻ¸āĻžāĻĨā§ āĻĄāĻžāĻāĻžāĻŦā§āĻ¸ā§ āĻ¸ā§āĻĨāĻžāĻĒāĻ¨ āĻāĻ°āĻž āĻšāĻŦā§āĨ¤ āĻĻā§āĻŦāĻžāĻ° āĻāĻŋāĻ¨ā§āĻ¤āĻž āĻ¨āĻž āĻāĻ°ā§, āĻāĻŽāĻŋ sqlite3 āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ° āĻāĻ°ā§āĻāĻŋ, āĻāĻžāĻ°āĻŖāĨ¤ āĻāĻāĻŋ āĻāĻŽ āĻļā§āĻ°āĻŽ-āĻ¨āĻŋāĻŦāĻŋāĻĄāĻŧ āĻāĻŋāĻ˛: āĻāĻāĻāĻŋ āĻ¸ā§āĻĨāĻžāĻ¨ā§āĻ¯āĻŧ āĻ¸āĻžāĻ°ā§āĻāĻžāĻ°ā§āĻ° āĻĒā§āĻ°āĻ¯āĻŧā§āĻāĻ¨ āĻ¨ā§āĻ, āĻ¤ā§āĻ°āĻŋ-āĻĻā§āĻāĻž-āĻŽā§āĻā§ āĻĢā§āĻ˛āĻž āĻāĻŦāĻ āĻāĻ° āĻŽāĻ¤ā§ āĻāĻŋāĻ¨āĻŋāĻ¸āĻĒāĻ¤ā§āĻ°āĨ¤
one_thread.py
from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime
def main(min, max):
    """Download Habr posts with ids in ``range(min, max)`` into ``habr.db``.

    Each post page is fetched from the mobile site, parsed with
    BeautifulSoup and inserted into the ``habr`` table
    (id, author, title, content, tags).

    * Ids whose request fails are appended to ``req_errors.txt``.
    * Ids answering with a non-200 status are printed and skipped.
    * Pages that cannot be parsed are stored with placeholder values.

    All inserts happen in a single transaction.  It is committed in a
    ``finally`` clause so that interrupting the script keeps the rows
    collected so far.

    Args:
        min: first post id to download (inclusive).
        max: post id to stop at (exclusive).
    """
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")

    start_time = datetime.now()
    c.execute("begin")
    try:
        for i in range(min, max):
            url = "https://m.habr.com/post/{}".format(i)
            try:
                r = requests.get(url)
            except requests.RequestException:
                # Append mode and a str payload: the log must survive
                # across failures and ``write`` rejects integers.
                with open("req_errors.txt", "a") as file:
                    file.write("{}\n".format(i))
                continue
            if r.status_code != 200:
                print("{} - {}".format(i, r.status_code))
                continue

            soup = BeautifulSoup(r.text, 'html.parser')
            try:
                author = soup.find(class_="tm-user-info__username").get_text()
                content = str(soup.find(id="post-content-body"))
                title = soup.find(class_="tm-article-title__text").get_text()
                tags = soup.find(class_="tm-article__tags").get_text()
                # Drop the 5-character prefix of the tags text.
                tags = tags[5:]
            except Exception:
                # Best effort: a page with an unexpected layout is still
                # recorded, with placeholder values.
                author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
                content = "При парсинге этой странице произошла ошибка."

            c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
            print(i)
    finally:
        conn.commit()
        conn.close()
    print(datetime.now() - start_time)
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main(1, 490406)
āĻ¸āĻŦāĻāĻŋāĻā§ āĻā§āĻ˛āĻžāĻ¸āĻŋāĻ - āĻāĻŽāĻ°āĻž āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ° āĻāĻ°āĻŋ āĻ¸ā§āĻ¨ā§āĻĻāĻ° āĻ¸ā§āĻ¯ā§āĻĒ, āĻ āĻ¨ā§āĻ°ā§āĻ§ āĻāĻŦāĻ āĻāĻāĻāĻŋ āĻĻā§āĻ°ā§āĻ¤ āĻĒā§āĻ°ā§āĻā§āĻāĻžāĻāĻĒ āĻĒā§āĻ°āĻ¸ā§āĻ¤ā§āĻ¤āĨ¤ āĻ¯ā§ āĻļā§āĻ§ā§âĻ
-
পৃষ্ঠা ডাউনলোড এক থ্রেড
-
āĻāĻĒāĻ¨āĻŋ āĻ¯āĻĻāĻŋ āĻ¸ā§āĻā§āĻ°āĻŋāĻĒā§āĻā§āĻ° āĻ¸āĻŽā§āĻĒāĻžāĻĻāĻ¨ā§ āĻŦāĻžāĻ§āĻž āĻĻā§āĻ¨, āĻ¤āĻžāĻšāĻ˛ā§ āĻĒā§āĻ°ā§ āĻĄāĻžāĻāĻžāĻŦā§āĻ¸āĻāĻŋ āĻā§āĻĨāĻžāĻ āĻ¯āĻžāĻŦā§ āĻ¨āĻžāĨ¤ āĻ¸āĻŦ āĻĒāĻ°ā§, āĻĒā§āĻ°āĻ¤āĻŋāĻļā§āĻ°ā§āĻ¤āĻŋ āĻļā§āĻ§ā§āĻŽāĻžāĻ¤ā§āĻ° āĻ¸āĻŽāĻ¸ā§āĻ¤ āĻĒāĻžāĻ°ā§āĻ¸āĻŋāĻ āĻĒāĻ°ā§ āĻ¸āĻā§āĻāĻžāĻ˛āĻŋāĻ¤ āĻšāĻ¯āĻŧ.
āĻ āĻŦāĻļā§āĻ¯āĻ, āĻāĻĒāĻ¨āĻŋ āĻĒā§āĻ°āĻ¤āĻŋāĻāĻŋ āĻ¸āĻ¨ā§āĻ¨āĻŋāĻŦā§āĻļā§āĻ° āĻĒāĻ°ā§ āĻĄāĻžāĻāĻžāĻŦā§āĻ¸ā§ āĻĒāĻ°āĻŋāĻŦāĻ°ā§āĻ¤āĻ¨ āĻāĻ°āĻ¤ā§ āĻĒāĻžāĻ°ā§āĻ¨, āĻāĻŋāĻ¨ā§āĻ¤ā§ āĻ¤āĻžāĻ°āĻĒāĻ° āĻ¸ā§āĻā§āĻ°āĻŋāĻĒā§āĻ āĻāĻžāĻ°ā§āĻ¯āĻāĻ° āĻāĻ°āĻžāĻ° āĻ¸āĻŽāĻ¯āĻŧ āĻāĻ˛ā§āĻ˛ā§āĻāĻ¯ā§āĻā§āĻ¯āĻāĻžāĻŦā§ āĻŦā§āĻĻā§āĻ§āĻŋ āĻĒāĻžāĻŦā§āĨ¤ -
āĻĒā§āĻ°āĻĨāĻŽ 100 āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ āĻŦāĻŋāĻļā§āĻ˛ā§āĻˇāĻŖ āĻāĻ°āĻ¤ā§ āĻāĻŽāĻžāĻ° 000 āĻāĻ¨ā§āĻāĻž āĻ¸āĻŽāĻ¯āĻŧ āĻ˛ā§āĻā§āĻā§āĨ¤
āĻĒāĻ°āĻŦāĻ°ā§āĻ¤ā§ āĻāĻŽāĻŋ āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ°āĻāĻžāĻ°ā§āĻ° āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ āĻā§āĻāĻā§
- মাল্টিথ্রেডিং ব্যবহার করে অনেক সময় ডাউনলোডের গতি বাড়ে।
- আপনি হাবরের সম্পূর্ণ সংস্করণ পেতে পারেন না, তবে এর মোবাইল সংস্করণ।
āĻāĻĻāĻžāĻšāĻ°āĻŖāĻ¸ā§āĻŦāĻ°ā§āĻĒ, āĻ¯āĻĻāĻŋ āĻĄā§āĻ¸ā§āĻāĻāĻĒ āĻ¸āĻāĻ¸ā§āĻāĻ°āĻŖā§ āĻāĻāĻāĻŋ āĻ¸āĻŽāĻ¨ā§āĻŦāĻŋāĻ¤ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ā§āĻ° āĻāĻāĻ¨ 378 KB āĻšāĻ¯āĻŧ, āĻ¤āĻŦā§ āĻŽā§āĻŦāĻžāĻāĻ˛ āĻ¸āĻāĻ¸ā§āĻāĻ°āĻŖā§ āĻāĻāĻŋ āĻāĻ¤āĻŋāĻŽāĻ§ā§āĻ¯ā§ 126 KBāĨ¤
দ্বিতীয় সংস্করণ। অনেক থ্রেড, হাবর থেকে সাময়িক নিষেধাজ্ঞা
āĻ¯āĻāĻ¨ āĻāĻŽāĻŋ āĻĒāĻžāĻāĻĨāĻ¨ā§ āĻŽāĻžāĻ˛ā§āĻāĻŋāĻĨā§āĻ°ā§āĻĄāĻŋāĻāĻ¯āĻŧā§āĻ° āĻŦāĻŋāĻˇāĻ¯āĻŧā§ āĻāĻ¨ā§āĻāĻžāĻ°āĻ¨ā§āĻ āĻāĻžāĻāĻāĻžāĻāĻžāĻāĻāĻŋ āĻāĻ°āĻŋ, āĻ¤āĻāĻ¨ āĻāĻŽāĻŋ multiprocessing.dummy-āĻāĻ° āĻ¸āĻžāĻĨā§ āĻ¸āĻŦāĻā§āĻ¯āĻŧā§ āĻ¸āĻšāĻ āĻŦāĻŋāĻāĻ˛ā§āĻĒāĻāĻŋ āĻŦā§āĻā§ āĻ¨āĻŋāĻ¯āĻŧā§āĻāĻŋāĻ˛āĻžāĻŽ, āĻāĻŽāĻŋ āĻ˛āĻā§āĻˇā§āĻ¯ āĻāĻ°ā§āĻāĻŋ āĻ¯ā§ āĻŽāĻžāĻ˛ā§āĻāĻŋāĻĨā§āĻ°ā§āĻĄāĻŋāĻāĻ¯āĻŧā§āĻ° āĻ¸āĻžāĻĨā§ āĻ¸āĻŽāĻ¸ā§āĻ¯āĻž āĻĻā§āĻāĻž āĻĻāĻŋāĻ¯āĻŧā§āĻā§āĨ¤
SQLite3 একাধিক থ্রেডের সাথে কাজ করতে চায় না.
āĻ¸ā§āĻĨāĻŋāĻ° check_same_thread=False
, āĻāĻŋāĻ¨ā§āĻ¤ā§ āĻāĻ āĻ¤ā§āĻ°ā§āĻāĻŋāĻāĻŋ āĻāĻāĻŽāĻžāĻ¤ā§āĻ° āĻ¨āĻ¯āĻŧ, āĻĄāĻžāĻāĻžāĻŦā§āĻ¸ā§ āĻ¸āĻ¨ā§āĻ¨āĻŋāĻŦā§āĻļ āĻāĻ°āĻžāĻ° āĻā§āĻˇā§āĻāĻž āĻāĻ°āĻžāĻ° āĻ¸āĻŽāĻ¯āĻŧ, āĻāĻāĻ¨āĻ āĻāĻāĻ¨āĻ āĻ¤ā§āĻ°ā§āĻāĻŋ āĻāĻā§ āĻ¯āĻž āĻāĻŽāĻŋ āĻ¸āĻŽāĻžāĻ§āĻžāĻ¨ āĻāĻ°āĻ¤ā§ āĻĒāĻžāĻ°āĻŋāĻ¨āĻŋāĨ¤
āĻ āĻ¤āĻāĻŦ, āĻāĻŽāĻŋ āĻ¸āĻ°āĻžāĻ¸āĻ°āĻŋ āĻĄāĻžāĻāĻžāĻŦā§āĻ¸ā§ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§āĻā§āĻ˛āĻŋāĻ° āĻ¤āĻžāĻ¤ā§āĻā§āĻˇāĻŖāĻŋāĻ āĻ¸āĻ¨ā§āĻ¨āĻŋāĻŦā§āĻļ āĻ¤ā§āĻ¯āĻžāĻ āĻāĻ°āĻžāĻ° āĻ¸āĻŋāĻĻā§āĻ§āĻžāĻ¨ā§āĻ¤ āĻ¨āĻŋāĻ¯āĻŧā§āĻāĻŋ āĻāĻŦāĻ āĻ¸āĻŽāĻ¨ā§āĻŦāĻŋāĻ¤ āĻ¸āĻŽāĻžāĻ§āĻžāĻ¨āĻāĻŋ āĻŽāĻ¨ā§ āĻ°ā§āĻā§ āĻāĻŽāĻŋ āĻĢāĻžāĻāĻ˛āĻā§āĻ˛āĻŋ āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ° āĻāĻ°āĻžāĻ° āĻ¸āĻŋāĻĻā§āĻ§āĻžāĻ¨ā§āĻ¤ āĻ¨āĻŋāĻ¯āĻŧā§āĻāĻŋ, āĻāĻžāĻ°āĻŖ āĻā§āĻ¨āĻ āĻĢāĻžāĻāĻ˛ā§ āĻŽāĻžāĻ˛ā§āĻāĻŋ-āĻĨā§āĻ°ā§āĻĄā§āĻĄ āĻ˛ā§āĻāĻžāĻ° āĻ¸āĻžāĻĨā§ āĻā§āĻ¨āĻ āĻ¸āĻŽāĻ¸ā§āĻ¯āĻž āĻ¨ā§āĻāĨ¤
হাবর তিনটি থ্রেডের বেশি ব্যবহার করার জন্য নিষিদ্ধ করা শুরু করে.
āĻšāĻžāĻŦāĻ°ā§ āĻ¯āĻžāĻāĻ¯āĻŧāĻžāĻ° āĻāĻ¨ā§āĻ¯ āĻŦāĻŋāĻļā§āĻˇāĻ¤ āĻāĻĻā§āĻ¯ā§āĻā§ āĻĒā§āĻ°āĻā§āĻˇā§āĻāĻž āĻāĻ¯āĻŧā§āĻ āĻāĻ¨ā§āĻāĻžāĻ° āĻāĻ¨ā§āĻ¯ āĻāĻāĻāĻŋ āĻāĻāĻĒāĻŋ āĻ¨āĻŋāĻˇā§āĻ§āĻžāĻā§āĻāĻžāĻ° āĻ¸āĻžāĻĨā§ āĻļā§āĻˇ āĻšāĻ¤ā§ āĻĒāĻžāĻ°ā§āĨ¤ āĻ¸ā§āĻ¤āĻ°āĻžāĻ āĻāĻĒāĻ¨āĻžāĻā§ āĻļā§āĻ§ā§āĻŽāĻžāĻ¤ā§āĻ° 3āĻāĻŋ āĻĨā§āĻ°ā§āĻĄ āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ° āĻāĻ°āĻ¤ā§ āĻšāĻŦā§, āĻ¤āĻŦā§ āĻāĻāĻŋ āĻāĻ¤āĻŋāĻŽāĻ§ā§āĻ¯ā§āĻ āĻāĻžāĻ˛, āĻ¯ā§āĻšā§āĻ¤ā§ 100 āĻāĻŋāĻ°āĻ āĻŦā§āĻļāĻŋ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ āĻĒā§āĻ¨āĻ°āĻžāĻŦā§āĻ¤ā§āĻ¤āĻŋ āĻāĻ°āĻžāĻ° āĻ¸āĻŽāĻ¯āĻŧ 26 āĻĨā§āĻā§ 12 āĻ¸ā§āĻā§āĻ¨ā§āĻĄā§ āĻāĻŽā§ āĻā§āĻā§āĨ¤
āĻāĻāĻŋ āĻ˛āĻā§āĻˇāĻŖā§āĻ¯āĻŧ āĻ¯ā§ āĻāĻ āĻ¸āĻāĻ¸ā§āĻāĻ°āĻŖāĻāĻŋ āĻŦāĻ°āĻ āĻ āĻ¸ā§āĻĨāĻŋāĻ°, āĻāĻŦāĻ āĻĄāĻžāĻāĻ¨āĻ˛ā§āĻĄāĻā§āĻ˛āĻŋ āĻĒāĻ°ā§āĻ¯āĻžāĻ¯āĻŧāĻā§āĻ°āĻŽā§ āĻĒā§āĻ°āĻā§āĻ° āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ā§ āĻĒāĻĄāĻŧā§āĨ¤
async_v1.py
from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    """Download one Habr post from the mobile site and dump it to JSON.

    The parsed fields are written as a JSON list
    ``[id, timestamp, author, title, content, tm_tag, rating, tags]``
    to ``files\\<i>.json``.

    Args:
        i: numeric id of the post.

    Returns:
        1 if the output file already exists, 2 if the HTTP request
        failed, the HTTP status code if it is not 200, and ``None``
        after the article has been written.
    """
    # Same runtime path as before; the backslash is escaped explicitly
    # because "\{" is not a valid escape sequence.
    currentFile = "files\\{}.json".format(i)
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)
    try:
        r = requests.get(url)
    except requests.RequestException:
        # Append mode and a str payload: the log must survive across
        # failures and ``write`` rejects integers.
        with open("req_errors.txt", "a") as file:
            file.write("{}\n".format(i))
        return 2

    # Record requests that the server blocked.
    if r.status_code == 503:
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
        logging.warning('{} / 503 Error'.format(i))

    # The post does not exist or has been hidden.
    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    soup = BeautifulSoup(r.text, 'html5lib')
    try:
        author = soup.find(class_="tm-user-info__username").get_text()
        timestamp = soup.find(class_='tm-user-meta__date')['title']
        content = str(soup.find(id="post-content-body"))
        title = soup.find(class_="tm-article-title__text").get_text()
        tags = soup.find(class_="tm-article__tags").get_text()
        # Drop the 5-character prefix of the tags text.
        tags = tags[5:]
        # Label marking the post as a translation or a tutorial.
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()
        rating = soup.find(class_="tm-votes-score").get_text()
    except Exception:
        # Best effort: keep a placeholder record and remember the id.
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "При парсинге этой странице произошла ошибка."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Write the article to json.
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except Exception:
        # Show which id failed, then let the error propagate.
        print(i)
        raise
if __name__ == '__main__':
    # Usage: async_v1.py <first id> <stop id>
    if len(sys.argv) < 3:
        print("Необходимы параметры min и max. Использование: async_v1.py 1 100")
        sys.exit(1)
    # Named so that the builtins min() and max() are not shadowed.
    first_id = int(sys.argv[1])
    stop_id = int(sys.argv[2])

    # With more than 3 threads Habr temporarily bans the IP address.
    pool = ThreadPool(3)

    # Start the timer and launch the threads.
    start_time = datetime.now()
    results = pool.map(worker, range(first_id, stop_id))

    # Once all threads have finished, print the elapsed time.
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
তৃতীয় সংস্করণ। ফাইনাল
āĻĻā§āĻŦāĻŋāĻ¤ā§āĻ¯āĻŧ āĻ¸āĻāĻ¸ā§āĻāĻ°āĻŖāĻāĻŋ āĻĄāĻŋāĻŦāĻžāĻ āĻāĻ°āĻžāĻ° āĻ¸āĻŽāĻ¯āĻŧ, āĻāĻŽāĻŋ āĻāĻŦāĻŋāĻˇā§āĻāĻžāĻ° āĻāĻ°ā§āĻāĻŋ āĻ¯ā§ Habr, āĻšāĻ āĻžā§ āĻāĻ°ā§, āĻāĻāĻāĻŋ API āĻāĻā§ āĻ¯āĻž āĻ¸āĻžāĻāĻā§āĻ° āĻŽā§āĻŦāĻžāĻāĻ˛ āĻ¸āĻāĻ¸ā§āĻāĻ°āĻŖ āĻ ā§āĻ¯āĻžāĻā§āĻ¸ā§āĻ¸ āĻāĻ°ā§āĨ¤ āĻāĻāĻŋ āĻŽā§āĻŦāĻžāĻāĻ˛ āĻ¸āĻāĻ¸ā§āĻāĻ°āĻŖā§āĻ° āĻā§āĻ¯āĻŧā§ āĻĻā§āĻ°ā§āĻ¤ āĻ˛ā§āĻĄ āĻšāĻ¯āĻŧ, āĻ¯ā§āĻšā§āĻ¤ā§ āĻāĻāĻŋ āĻļā§āĻ§ā§āĻŽāĻžāĻ¤ā§āĻ° json, āĻ¯āĻž āĻāĻŽāĻ¨āĻāĻŋ āĻĒāĻžāĻ°ā§āĻ¸ āĻāĻ°āĻžāĻ° āĻĒā§āĻ°āĻ¯āĻŧā§āĻāĻ¨ āĻ¨ā§āĻā§ˇ āĻļā§āĻˇ āĻĒāĻ°ā§āĻ¯āĻ¨ā§āĻ¤, āĻāĻŽāĻŋ āĻāĻŦāĻžāĻ° āĻāĻŽāĻžāĻ° āĻ¸ā§āĻā§āĻ°āĻŋāĻĒā§āĻ āĻĒā§āĻ¨āĻ°ā§āĻ˛āĻŋāĻāĻ¨ āĻāĻ°āĻžāĻ° āĻ¸āĻŋāĻĻā§āĻ§āĻžāĻ¨ā§āĻ¤ āĻ¨āĻŋāĻ¯āĻŧā§āĻāĻŋāĨ¤
āĻ¸ā§āĻ¤āĻ°āĻžāĻ, āĻā§āĻāĻā§ āĻĒā§āĻ¯āĻŧā§
async_v2.py
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    """Fetch one article from the Habr mobile API and dump it to JSON.

    When the API reports success, the selected fields are written as a
    JSON list to ``files\\<i>.json`` in this order: id, is_tutorial,
    time_published, title, content, comments_count, lang, tags_string,
    reading_count, author, score.

    Args:
        i: numeric id of the article.

    Returns:
        1 if the output file already exists, 2 if the HTTP request
        failed, 503 if the server answered with status 503, otherwise
        ``None``.
    """
    # Same runtime path as before; the backslash is escaped explicitly
    # because "\{" is not a valid escape sequence.
    currentFile = "files\\{}.json".format(i)
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)
    try:
        r = requests.get(url)
    except requests.RequestException:
        # Append mode and a str payload: the log must survive across
        # failures and ``write`` rejects integers.
        with open("req_errors.txt", "a") as file:
            file.write("{}\n".format(i))
        return 2

    if r.status_code == 503:
        logging.critical("503 Error")
        return 503

    data = json.loads(r.text)
    if data['success']:
        article = data['data']['article']

        # Keep only the fields of interest; the order is the on-disk
        # format that the database import script unpacks.
        record = (
            article['id'],
            article['is_tutorial'],
            article['time_published'],
            article['title'],
            article['text_html'],
            article['comments_count'],
            article['lang'],
            article['tags_string'],
            article['reading_count'],
            article['author']['login'],
            article['voting']['score'],
        )
        with open(currentFile, "w") as write_file:
            json.dump(record, write_file)
if __name__ == '__main__':
    # Usage: async_v2.py <first id> <stop id>
    if len(sys.argv) < 3:
        print("Необходимы параметры min и max. Использование: async_v2.py 1 100")
        sys.exit(1)
    # Named so that the builtins min() and max() are not shadowed.
    first_id = int(sys.argv[1])
    stop_id = int(sys.argv[2])

    # With more than 3 threads Habr temporarily bans the IP address.
    pool = ThreadPool(3)

    # Start the timer and launch the threads.
    start_time = datetime.now()
    results = pool.map(worker, range(first_id, stop_id))

    # Once all threads have finished, print the elapsed time.
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
āĻāĻ¤ā§ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§āĻāĻŋ āĻāĻŦāĻ āĻ˛ā§āĻāĻ āĻ¯āĻŋāĻ¨āĻŋ āĻāĻāĻŋ āĻ˛āĻŋāĻā§āĻā§āĻ¨ āĻāĻāĻ¯āĻŧā§āĻ° āĻ¸āĻžāĻĨā§ āĻ¸āĻŽā§āĻĒāĻ°ā§āĻāĻŋāĻ¤ āĻā§āĻˇā§āĻ¤ā§āĻ° āĻ°āĻ¯āĻŧā§āĻā§ā§ˇ
API.png
āĻāĻŽāĻŋ āĻĒā§āĻ°āĻ¤āĻŋāĻāĻŋ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ā§āĻ° āĻ¸āĻŽā§āĻĒā§āĻ°ā§āĻŖ json āĻĄāĻžāĻŽā§āĻĒ āĻāĻ°āĻŋāĻ¨āĻŋ, āĻ¤āĻŦā§ āĻļā§āĻ§ā§āĻŽāĻžāĻ¤ā§āĻ° āĻāĻŽāĻžāĻ° āĻĒā§āĻ°āĻ¯āĻŧā§āĻāĻ¨ā§āĻ¯āĻŧ āĻā§āĻˇā§āĻ¤ā§āĻ°āĻā§āĻ˛āĻŋ āĻ¸āĻāĻ°āĻā§āĻˇāĻŖ āĻāĻ°ā§āĻāĻŋ:
- id
- is_tutorial
- time_published
- title
- content
- comments_count
- lang হল সেই ভাষা যেখানে নিবন্ধটি লেখা হয়। এখন পর্যন্ত, এটি শুধুমাত্র en এবং ru আছে।
- tags_string - পোস্ট থেকে সমস্ত ট্যাগ
- reading_count
- author
- score - নিবন্ধ রেটিং।
āĻāĻāĻāĻžāĻŦā§, API āĻŦā§āĻ¯āĻŦāĻšāĻžāĻ° āĻāĻ°ā§, āĻāĻŽāĻŋ āĻĒā§āĻ°āĻ¤āĻŋ 8 āĻāĻāĻāĻ°āĻāĻ˛ā§ 100 āĻ¸ā§āĻā§āĻ¨ā§āĻĄā§ āĻ¸ā§āĻā§āĻ°āĻŋāĻĒā§āĻ āĻāĻžāĻ°ā§āĻ¯āĻāĻ° āĻāĻ°āĻžāĻ° āĻ¸āĻŽāĻ¯āĻŧ āĻāĻŽāĻŋāĻ¯āĻŧā§āĻāĻŋāĨ¤
āĻāĻŽāĻžāĻĻā§āĻ° āĻĒā§āĻ°āĻ¯āĻŧā§āĻāĻ¨ā§āĻ¯āĻŧ āĻĄā§āĻāĻž āĻĄāĻžāĻāĻ¨āĻ˛ā§āĻĄ āĻāĻ°āĻžāĻ° āĻĒāĻ°ā§, āĻāĻŽāĻžāĻĻā§āĻ° āĻāĻāĻŋ āĻĒā§āĻ°āĻā§āĻ°āĻŋāĻ¯āĻŧāĻž āĻāĻ°āĻ¤ā§ āĻšāĻŦā§ āĻāĻŦāĻ āĻĄāĻžāĻāĻžāĻŦā§āĻ¸ā§ āĻĒā§āĻ°āĻŦā§āĻļ āĻāĻ°āĻ¤ā§ āĻšāĻŦā§āĨ¤ āĻāĻ¤ā§āĻ āĻāĻŽāĻžāĻ° āĻā§āĻ¨ā§ āĻ¸āĻŽāĻ¸ā§āĻ¯āĻž āĻšāĻ¯āĻŧāĻ¨āĻŋ:
parser.py
import json
import sqlite3
import logging
from datetime import datetime
def parser(min, max):
    """Load downloaded article dumps into the ``articles`` table of ``habr.db``.

    For every id in ``range(min, max)`` the file ``files\\<id>.json`` is
    read and its 11 fields are inserted as one row.  Ids without a file
    are skipped.  The transaction is committed and the connection closed
    even if the import stops early.

    Args:
        min: first article id to import (inclusive).
        max: article id to stop at (exclusive).
    """
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    # Disable write confirmation; this makes inserts several times faster.
    c.execute('PRAGMA synchronous = 0')
    c.execute(
        "CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, "
        "author TEXT, title TEXT, content TEXT, lang TEXT, comments_count INTEGER, "
        "reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)"
    )
    try:
        for i in range(min, max):
            # Same runtime path as before; the backslash is escaped
            # explicitly because "\{" is not a valid escape sequence.
            filename = "files\\{}.json".format(i)
            try:
                # The context manager closes the file even when the JSON
                # is malformed.
                with open(filename) as f:
                    data = json.load(f)
            except IOError:
                logging.info('FileNotExists')
                continue

            (article_id, is_tutorial, time_published, title, content,
             comments_count, lang, tags_string, reading_count, author,
             score) = data

            # The dump order differs from the column order: the table is
            # laid out for readability of the database, so the fields
            # are reordered here instead of passing ``data`` directly.
            c.execute(
                'INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                (article_id, time_published, author, title, content, lang,
                 comments_count, reading_count, score, is_tutorial,
                 tags_string),
            )
    finally:
        conn.commit()
        conn.close()
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    start_time = datetime.now()
    parser(490000, 490918)
    print(datetime.now() - start_time)
Статистика
āĻāĻžāĻ˛, āĻāĻ¤āĻŋāĻšā§āĻ¯āĻāĻ¤āĻāĻžāĻŦā§, āĻ āĻŦāĻļā§āĻˇā§, āĻāĻĒāĻ¨āĻŋ āĻĄā§āĻāĻž āĻĨā§āĻā§ āĻāĻŋāĻā§ āĻĒāĻ°āĻŋāĻ¸āĻāĻā§āĻ¯āĻžāĻ¨ āĻŦā§āĻ° āĻāĻ°āĻ¤ā§ āĻĒāĻžāĻ°ā§āĻ¨:
- āĻĒā§āĻ°āĻ¤ā§āĻ¯āĻžāĻļāĻŋāĻ¤ 490 āĻĄāĻžāĻāĻ¨āĻ˛ā§āĻĄā§āĻ° āĻŽāĻ§ā§āĻ¯ā§, āĻļā§āĻ§ā§āĻŽāĻžāĻ¤ā§āĻ° 406 āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ āĻĄāĻžāĻāĻ¨āĻ˛ā§āĻĄ āĻāĻ°āĻž āĻšāĻ¯āĻŧā§āĻā§āĨ¤ āĻĻā§āĻāĻž āĻ¯āĻžāĻā§āĻā§ āĻ¯ā§ āĻšāĻžāĻŦāĻ°ā§ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§āĻā§āĻ˛āĻŋāĻ° āĻ āĻ°ā§āĻ§ā§āĻā§āĻ°āĻ āĻŦā§āĻļāĻŋ (228) āĻ˛ā§āĻāĻžāĻ¨ā§ āĻŦāĻž āĻŽā§āĻā§ āĻĢā§āĻ˛āĻž āĻšāĻ¯āĻŧā§āĻā§āĨ¤
- āĻĒā§āĻ°āĻžāĻ¯āĻŧ āĻ āĻ°ā§āĻ§ āĻŽāĻŋāĻ˛āĻŋāĻ¯āĻŧāĻ¨ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ āĻ¸āĻŽāĻ¨ā§āĻŦāĻŋāĻ¤ āĻ¸āĻŽāĻā§āĻ° āĻĄāĻžāĻāĻžāĻŦā§āĻ¸āĻāĻŋāĻ° āĻāĻāĻ¨ 2.95 āĻāĻŋāĻŦāĻŋāĨ¤ āĻ¸āĻāĻā§āĻāĻŋāĻ¤ āĻāĻāĻžāĻ°ā§ - 495 āĻāĻŽāĻŦāĻŋāĨ¤
- āĻŽā§āĻ, 37804 āĻāĻ¨ HabrÊ āĻāĻ° āĻ˛ā§āĻāĻāĨ¤ āĻāĻŽāĻŋ āĻāĻĒāĻ¨āĻžāĻā§ āĻŽāĻ¨ā§ āĻāĻ°āĻŋāĻ¯āĻŧā§ āĻĻāĻŋāĻā§āĻāĻŋ āĻ¯ā§ āĻāĻ āĻĒāĻ°āĻŋāĻ¸āĻāĻā§āĻ¯āĻžāĻ¨āĻā§āĻ˛āĻŋ āĻļā§āĻ§ā§āĻŽāĻžāĻ¤ā§āĻ° āĻ˛āĻžāĻāĻ āĻĒā§āĻ¸ā§āĻ āĻĨā§āĻā§āĨ¤
- āĻšāĻžāĻŦā§āĻ°ā§-āĻāĻ° āĻ¸āĻŦāĻā§āĻ¯āĻŧā§ āĻā§āĻĒāĻžāĻĻāĻ¨āĻļā§āĻ˛ āĻ˛ā§āĻāĻ-
āĻāĻ˛āĻŋāĻāĻžāĻ° - 8774āĻāĻŋ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§āĨ¤ āĻļā§āĻ°ā§āĻˇ āĻ°ā§āĻ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ â 1448 āĻĒā§āĻ˛āĻžāĻ¸āĻ¸āĻ°ā§āĻŦāĻžāĻ§āĻŋāĻ āĻĒāĻ āĻŋāĻ¤ āĻ¨āĻŋāĻŦāĻ¨ā§āĻ§ â 1660841 āĻŦāĻžāĻ° āĻĻā§āĻāĻž āĻšāĻ¯āĻŧā§āĻā§āĻ¸āĻ°ā§āĻŦāĻžāĻ§āĻŋāĻ āĻāĻ˛ā§āĻāĻŋāĻ¤ āĻĒā§āĻ°āĻŦāĻ¨ā§āĻ§ â 2444 āĻŽāĻ¨ā§āĻ¤āĻŦā§āĻ¯
ভাল, শীর্ষ আকারে
শীর্ষ 15 লেখক
রেটিং দ্বারা শীর্ষ 15
শীর্ষ 15 পড়া
শীর্ষ 15 আলোচিত
উত্স: www.habr.com