Habr αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αž“αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž˜αž½αž™αŸ”

αž’αžšαž»αžŽαžŸαž½αžŸαŸ’αžαžΈαŸ” αžœαžΆαž˜αžΆαž“αžšαž™αŸˆαž–αŸαž› 2 αž†αŸ’αž“αžΆαŸ†αž αžΎαž™αž…αžΆαž”αŸ‹αžαžΆαŸ†αž„αž–αžΈαžαŸ’αž‰αž»αŸ†αžŸαžšαžŸαŸαžšαžœαžΆαŸ” αž’αžαŸ’αžαž”αž‘αž…αž»αž„αž€αŸ’αžšαŸ„αž™ αž’αŸ†αž–αžΈαž€αžΆαžšαž‰αŸ‚αž€ Habr αž αžΎαž™αžšαžΏαž„αžαŸ’αž›αŸ‡αž”αžΆαž“αž•αŸ’αž›αžΆαžŸαŸ‹αž”αŸ’αžαžΌαžšαŸ”

αž“αŸ…αž–αŸαž›αžŠαŸ‚αž›αžαŸ’αž‰αž»αŸ†αž…αž„αŸ‹αž˜αžΆαž“αž…αŸ’αž”αžΆαž”αŸ‹αž…αž˜αŸ’αž›αž„αž“αŸƒ Habr αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αžŸαž˜αŸ’αžšαŸαž…αž…αž·αžαŸ’αžαžŸαžšαžŸαŸαžš parser αžŠαŸ‚αž›αž“αžΉαž„αžšαž€αŸ’αžŸαžΆαž‘αž»αž€αž˜αžΆαžαž·αž€αžΆαž‘αžΆαŸ†αž„αž’αžŸαŸ‹αžšαž”αžŸαŸ‹αž’αŸ’αž“αž€αž“αž·αž–αž“αŸ’αž’αž‘αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αŸ” αžšαž”αŸ€αž”αžŠαŸ‚αž›αžœαžΆαž€αžΎαžαž‘αžΎαž„αž“αž·αž„αž€αŸ†αž αž»αžŸαž’αŸ’αžœαžΈαžŠαŸ‚αž›αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αž‡αž½αž”αž”αŸ’αžšαž‘αŸ‡ - αž’αŸ’αž“αž€αž’αžΆαž…αž’αžΆαž“αž“αŸ…αž€αŸ’αžšαŸ„αž˜αž€αžΆαžšαž€αžΆαžαŸ‹αŸ”

TLDR- αž—αŸ’αž‡αžΆαž”αŸ‹αž‘αŸ…αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™

αž€αŸ†αžŽαŸ‚αžŠαŸ†αž”αžΌαž„αž“αŸƒαž§αž”αž€αžšαžŽαŸαž‰αŸ‚αž€αŸ” αžαŸ’αžŸαŸ‚αžαŸ‚αž˜αž½αž™αž”αž‰αŸ’αž αžΆαž‡αžΆαž…αŸ’αžšαžΎαž“αŸ”

αžŠαžΎαž˜αŸ’αž”αžΈαž…αžΆαž”αŸ‹αž•αŸ’αžαžΎαž˜ αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αžŸαž˜αŸ’αžšαŸαž…αž…αž·αžαŸ’αžαž”αž„αŸ’αž€αžΎαžαž‚αŸ†αžšαžΌαžŠαžΎαž˜αž“αŸƒαžŸαŸ’αž‚αŸ’αžšαžΈαž” αžŠαŸ‚αž›αž—αŸ’αž›αžΆαž˜αŸ—αž”αž“αŸ’αž‘αžΆαž”αŸ‹αž–αžΈαž€αžΆαžšαž‘αžΆαž‰αž™αž€ αž’αžαŸ’αžαž”αž‘αž“αžΉαž„αžαŸ’αžšαžΌαžœαž”αžΆαž“αž‰αŸ‚αž€ αž αžΎαž™αžŠαžΆαž€αŸ‹αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αŸ” αžŠαŸ„αž™αž˜αž·αž“αž‚αž·αžαž–αžΈαžšαžŠαž„ αžαŸ’αž‰αž»αŸ†αž”αŸ’αžšαžΎ sqlite3 αž–αŸ’αžšαŸ„αŸ‡... αžœαžΆαž˜αž·αž“αžŸαžΌαžœαž”αŸ’αžšαžΎαž€αž˜αŸ’αž›αžΆαŸ†αž„αž–αž›αž€αž˜αŸ’αž˜αž‘αŸαŸ– αž’αŸ’αž“αž€αž˜αž·αž“αž…αžΆαŸ†αž”αžΆαž…αŸ‹αž˜αžΆαž“αž˜αŸ‰αžΆαžŸαŸŠαžΈαž“αž˜αŸαž€αŸ’αž“αž»αž„αžŸαŸ’αžšαž»αž€ αž”αž„αŸ’αž€αžΎαž αž˜αžΎαž› αž›αž»αž” αž“αž·αž„αž’αŸ’αžœαžΈαŸ—αžŠαžΌαž…αž“αŸ„αŸ‡αž‘αŸαŸ”

one_thread.py

from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime

def main(min, max):
    """Download Habr posts one by one and store them in a SQLite database.

    For every id in ``range(min, max)`` the mobile page
    ``https://m.habr.com/post/<id>`` is fetched, parsed with BeautifulSoup and
    inserted into the ``habr`` table of ``habr.db`` (created if missing).

    All inserts happen inside one explicit transaction that is committed only
    after the last id was processed, so an interrupted run stores nothing.

    Args:
        min: First post id to download (inclusive). The name shadows the
            builtin but is kept for backward compatibility with callers.
        max: Post id to stop at (exclusive). Same note as for ``min``.

    Side effects:
        Writes ``habr.db``; appends the ids of failed requests to
        ``req_errors.txt``; prints progress and the elapsed time.
    """
    conn = sqlite3.connect('habr.db')
    try:
        c = conn.cursor()
        c.execute('PRAGMA encoding = "UTF-8"')
        c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content  TEXT, tags TEXT)")

        start_time = datetime.now()
        c.execute("begin")
        for i in range(min, max):
            url = "https://m.habr.com/post/{}".format(i)
            try:
                r = requests.get(url)
            except requests.RequestException:
                # Append mode + text: the original opened the file read-only
                # and wrote an int, so the error handler itself crashed.
                with open("req_errors.txt", "a") as file:
                    file.write("{}\n".format(i))
                continue
            if r.status_code != 200:
                print("{} - {}".format(i, r.status_code))
                continue

            html_doc = r.text
            soup = BeautifulSoup(html_doc, 'html.parser')

            try:
                author = soup.find(class_="tm-user-info__username").get_text()
                content = str(soup.find(id="post-content-body"))
                title = soup.find(class_="tm-article-title__text").get_text()
                tags = soup.find(class_="tm-article__tags").get_text()
                # Drops the first 5 characters of the tag text
                # (presumably a label prefix - TODO confirm against the page).
                tags = tags[5:]
            except Exception:
                # Best effort: a page with unexpected markup is stored as an
                # "Error" row instead of aborting the whole run.
                author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
                content = "ΠŸΡ€ΠΈ парсингС этой страницС ΠΏΡ€ΠΎΠΈΠ·ΠΎΡˆΠ»Π° ошибка."

            c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
            print(i)
        c.execute("commit")
        print(datetime.now() - start_time)
    finally:
        # Release the database handle even when the run is interrupted.
        conn.close()

if __name__ == '__main__':
    # Crawl post ids 1 .. 490405; guarded so importing the module does not
    # start the download.
    main(1, 490406)

αž’αŸ’αžœαžΈαž‚αŸ’αžšαž”αŸ‹αž™αŸ‰αžΆαž„αž‚αžΊαž™αŸ„αž„αž‘αŸ…αžαžΆαž˜αž”αž»αžšαžΆαžŽ - αž™αžΎαž„αž”αŸ’αžšαžΎαžŸαŸŠαž»αž”αžŠαŸαžŸαŸ’αžšαžŸαŸ‹αžŸαŸ’αž’αžΆαžαžŸαŸ†αžŽαžΎαž“αž·αž„αž‚αŸ†αžšαžΌαžšαž αŸαžŸαž‚αžΊαžšαž½αž…αžšαžΆαž›αŸ‹αŸ” αž“αŸ„αŸ‡αž‚αŸ’αžšαžΆαž“αŸ‹αžαŸ‚αž‡αžΆ...

  • αž‘αŸ†αž–αŸαžšαžαŸ’αžšαžΌαžœαž”αžΆαž“αž‘αžΆαž‰αž™αž€αž€αŸ’αž“αž»αž„αžαŸ’αžŸαŸ‚αžαŸ‚αž˜αž½αž™

  • αž”αŸ’αžšαžŸαž·αž“αž”αžΎαž’αŸ’αž“αž€αžšαŸ†αžαžΆαž“αž€αžΆαžšαž”αŸ’αžšαžαž·αž”αžαŸ’αžαž·αž“αŸƒαžŸαŸ’αž‚αŸ’αžšαžΈαž” αž“αŸ„αŸ‡αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž‘αžΆαŸ†αž„αž˜αžΌαž›αž“αžΉαž„αž˜αž·αž“αž‘αŸ…αžŽαžΆαž‘αŸαŸ” αž”αž“αŸ’αž‘αžΆαž”αŸ‹αž–αžΈαž‘αžΆαŸ†αž„αž’αžŸαŸ‹ αž€αžΆαžšαž”αŸ’αžαŸαž‡αŸ’αž‰αžΆαž…αž·αžαŸ’αžαžαŸ’αžšαžΌαžœαž”αžΆαž“αž”αŸ’αžšαžαž·αž”αžαŸ’αžαž·αžαŸ‚αž”αž“αŸ’αž‘αžΆαž”αŸ‹αž–αžΈαž€αžΆαžšαžœαž·αž—αžΆαž‚αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αŸ”
    αž‡αžΆβ€‹αž€αžΆαžšβ€‹αž–αž·αžβ€‹αžŽαžΆαžŸαŸ‹ αž’αŸ’αž“αž€β€‹αž’αžΆαž…β€‹αž’αŸ’αžœαžΎβ€‹αž€αžΆαžšβ€‹αž•αŸ’αž›αžΆαžŸαŸ‹β€‹αž”αŸ’αžαžΌβ€‹αžšβ€‹αž‘αŸ…β€‹αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“β€‹αž‘αž·αž“αŸ’αž“αž“αŸαž™β€‹αž”αž“αŸ’αž‘αžΆαž”αŸ‹β€‹αž–αžΈβ€‹αž€αžΆαžšβ€‹αž”αž‰αŸ’αž…αžΌαž›β€‹αž‚αŸ’αž“αžΆ αž”αŸ‰αž»αž“αŸ’αžαŸ‚β€‹αž”αž“αŸ’αž‘αžΆαž”αŸ‹β€‹αž˜αž€β€‹αž–αŸαž›β€‹αžœαŸαž›αžΆβ€‹αž”αŸ’αžšαžαž·αž”αžαŸ’αžαž·β€‹αžŸαŸ’αž‚αŸ’αžšαžΈαž”β€‹αž“αžΉαž„β€‹αž€αžΎαž“β€‹αž‘αžΎαž„β€‹αž™αŸ‰αžΆαž„β€‹αžαŸ’αž›αžΆαŸ†αž„αŸ”

  • αž€αžΆαžšαž‰αŸ‚αž€αž’αžαŸ’αžαž”αž‘ 100 αžŠαŸ†αž”αžΌαž„αž”αžΆαž“αž…αŸ†αžŽαžΆαž™αž–αŸαž› 000 αž˜αŸ‰αŸ„αž„αŸ”

αž”αž“αŸ’αž‘αžΆαž”αŸ‹αž˜αž€αžαŸ’αž‰αž»αŸ†αžšαž€αžƒαžΎαž‰αž’αžαŸ’αžαž”αž‘αžšαž”αžŸαŸ‹αž’αŸ’αž“αž€αž”αŸ’αžšαžΎ αžšαž½αž˜αž”αž‰αŸ’αž…αžΌαž›αž‚αŸ’αž“αžΆαžŠαŸ‚αž›αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αž’αžΆαž“ αž“αž·αž„αž”αžΆαž“αžšαž€αžƒαžΎαž‰αž€αžΆαžšαž›αž½αž…αž…αžΌαž›αž‡αžΈαžœαž·αžαž‡αžΆαž…αŸ’αžšαžΎαž“ αžŠαžΎαž˜αŸ’αž”αžΈαž”αž„αŸ’αž€αžΎαž“αž›αŸ’αž”αžΏαž“αžŠαŸ†αžŽαžΎαžšαž€αžΆαžšαž“αŸαŸ‡αŸ–

  • αž€αžΆαžšαž”αŸ’αžšαžΎαž”αŸ’αžšαžΆαžŸαŸ‹ multithreading αž”αž„αŸ’αž€αžΎαž“αž›αŸ’αž”αžΏαž“αž€αžΆαžšαž‘αžΆαž‰αž™αž€αž™αŸ‰αžΆαž„αžαŸ’αž›αžΆαŸ†αž„αŸ”
  • αž’αŸ’αž“αž€β€‹αž˜αž·αž“β€‹αž’αžΆαž…β€‹αž‘αž‘αž½αž›β€‹αž€αŸ†αžŽαŸ‚β€‹αž–αŸαž‰β€‹αž›αŸαž‰β€‹αžšαž”αžŸαŸ‹ Habr αž‘αŸ αž”αŸ‰αž»αž“αŸ’αžαŸ‚β€‹αž‡αžΆβ€‹αž€αŸ†αžŽαŸ‚β€‹αž‘αžΌαžšαžŸαž–αŸ’αž‘β€‹αžšαž”αžŸαŸ‹β€‹αžœαžΆαŸ”
    αž§αž‘αžΆαž αžšαžŽαŸ αž”αŸ’αžšαžŸαž·αž“αž”αžΎαž’αžαŸ’αžαž”αž‘αžšαž½αž˜αž”αž‰αŸ’αž…αžΌαž›αž‚αŸ’αž“αžΆαž“αŸ…αž€αŸ’αž“αž»αž„αž€αŸ†αžŽαŸ‚αž•αŸ’αž‘αŸƒαžαž»αž˜αžΆαž“αž‘αž˜αŸ’αž„αž“αŸ‹ 378 KB αž“αŸ„αŸ‡αž“αŸ…αž€αŸ’αž“αž»αž„αž€αŸ†αžŽαŸ‚αž…αž›αŸαžαžœαžΆαž˜αžΆαž“ 126 KB αžšαž½αž…αž αžΎαž™αŸ”

αž€αŸ†αžŽαŸ‚αž‘αžΈαž–αžΈαžšαŸ” αžαŸ’αžŸαŸ‚αžŸαŸ’αžšαž‘αžΆαž™αž‡αžΆαž…αŸ’αžšαžΎαž“ αž€αžΆαžšαž αžΆαž˜αžƒαžΆαžαŸ‹αž”αžŽαŸ’αžαŸ„αŸ‡αž’αžΆαžŸαž“αŸ’αž“αž–αžΈ Habr

αž“αŸ…αž–αŸαž›αžŠαŸ‚αž›αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αž˜αžΎαž›αž’αŸŠαžΈαž“αž’αžΊαžŽαž·αžαž›αžΎαž”αŸ’αžšαž’αžΆαž“αž”αž‘αž“αŸƒ multithreading αž“αŸ…αž€αŸ’αž“αž»αž„ python αž αžΎαž™αž‡αŸ’αžšαžΎαžŸαžšαžΎαžŸαž‡αž˜αŸ’αžšαžΎαžŸαžŠαŸαžŸαžΆαž˜αž‰αŸ’αž‰αž”αŸ†αž•αž»αžαž‡αžΆαž˜αž½αž™ multiprocessing.dummy αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αž€αžαŸ‹αžŸαž˜αŸ’αž‚αžΆαž›αŸ‹αžαžΆαž”αž‰αŸ’αž αžΆαž”αžΆαž“αž›αŸαž…αž‘αžΎαž„αžšαž½αž˜αž‡αžΆαž˜αž½αž™ multithreading αŸ”

SQLite3 αž˜αž·αž“αž…αž„αŸ‹αž’αŸ’αžœαžΎαž€αžΆαžšαž‡αžΆαž˜αž½αž™αžαŸ’αžŸαŸ‚αžŸαŸ’αžšαž‘αžΆαž™αž…αŸ’αžšαžΎαž“αž‡αžΆαž„αž˜αž½αž™αŸ”
αž‡αž½αžŸαž‡αž»αž› check_same_thread=False αž”αŸ‰αž»αž“αŸ’αžαŸ‚αž€αŸ†αž αž»αžŸαž“αŸαŸ‡αž˜αž·αž“αž˜αŸ‚αž“αžαŸ‚αž˜αž½αž™αž‘αŸ αž“αŸ…αž–αŸαž›αžŠαŸ‚αž›αž–αŸ’αž™αžΆαž™αžΆαž˜αž”αž‰αŸ’αž…αžΌαž›αž‘αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™ αž–αŸαž›αžαŸ’αž›αŸ‡αž€αŸ†αž αž»αžŸαž€αžΎαžαž‘αžΎαž„αžŠαŸ‚αž›αžαŸ’αž‰αž»αŸ†αž˜αž·αž“αž’αžΆαž…αžŠαŸ„αŸ‡αžŸαŸ’αžšαžΆαž™αž”αžΆαž“αŸ”

αžŠαžΌαž…αŸ’αž“αŸαŸ‡αž αžΎαž™ αžαŸ’αž‰αž»αŸ†αžŸαž˜αŸ’αžšαŸαž…αž…αž·αžαŸ’αžαž”αŸ„αŸ‡αž”αž„αŸ‹αž€αžΆαžšαž”αž‰αŸ’αž…αžΌαž›αž’αžαŸ’αžαž”αž‘αž—αŸ’αž›αžΆαž˜αŸ—αž‘αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™ αž αžΎαž™αžŠαŸ„αž™αž“αžΉαž€αžƒαžΎαž‰αžŠαŸ†αžŽαŸ„αŸ‡αžŸαŸ’αžšαžΆαž™αžšαž½αž˜αž”αž‰αŸ’αž…αžΌαž›αž‚αŸ’αž“αžΆ αžαŸ’αž‰αž»αŸ†αžŸαž˜αŸ’αžšαŸαž…αž…αž·αžαŸ’αžαž”αŸ’αžšαžΎαž―αž€αžŸαžΆαžš αž–αŸ’αžšαŸ„αŸ‡αž˜αž·αž“αž˜αžΆαž“αž”αž‰αŸ’αž αžΆαž‡αžΆαž˜αž½αž™αž€αžΆαžšαžŸαžšαžŸαŸαžšαž…αŸ’αžšαžΎαž“αžαŸ’αžŸαŸ‚αž‘αŸ…αž―αž€αžŸαžΆαžšαž‘αŸαŸ”

Habr αž…αžΆαž”αŸ‹αž•αŸ’αžαžΎαž˜αž αžΆαž˜αž”αŸ’αžšαžΆαž˜αžŸαž˜αŸ’αžšαžΆαž”αŸ‹αž€αžΆαžšαž”αŸ’αžšαžΎαžαŸ’αžŸαŸ‚αž›αžΎαžŸαž–αžΈαž”αžΈ.
αž€αžΆαžšαž–αŸ’αž™αžΆαž™αžΆαž˜αžŠαŸ„αž™αž§αžŸαŸ’αžŸαžΆαž αŸαž–αŸ’αž™αžΆαž™αžΆαž˜αž‡αžΆαž–αž·αžŸαŸαžŸαžŠαžΎαž˜αŸ’αž”αžΈαž‘αŸ…αžŠαž›αŸ‹ Habr αž’αžΆαž…αž”αžŽαŸ’αžαžΆαž›αž±αŸ’αž™αž˜αžΆαž“αž€αžΆαžšαž αžΆαž˜αžƒαžΆαžαŸ‹ IP αžŸαž˜αŸ’αžšαžΆαž”αŸ‹αž–αžΈαžšαž”αžΈαž˜αŸ‰αŸ„αž„αŸ” αžŠαžΌαž…αŸ’αž“αŸαŸ‡αž’αŸ’αž“αž€αžαŸ’αžšαžΌαžœαž”αŸ’αžšαžΎαžαŸ‚ 3 αžαŸ’αžŸαŸ‚ αž”αŸ‰αž»αž“αŸ’αžαŸ‚αž“αŸαŸ‡αž‚αžΊαž›αŸ’αž’αžšαž½αž…αž‘αŸ…αž αžΎαž™ αž…αžΆαž”αŸ‹αžαžΆαŸ†αž„αž–αžΈαž–αŸαž›αžŠαžΎαž˜αŸ’αž”αžΈαžαž˜αŸ’αžšαŸ€αž”αžαžΆαž˜ 100 αž’αžαŸ’αžαž”αž‘αžαŸ’αžšαžΌαžœαž”αžΆαž“αž€αžΆαžαŸ‹αž”αž“αŸ’αžαž™αž–αžΈ 26 αž‘αŸ… 12 αžœαž·αž“αžΆαž‘αžΈαŸ”

αžœαžΆαž‚αž½αžšαž±αŸ’αž™αž€αžαŸ‹αžŸαž˜αŸ’αž‚αžΆαž›αŸ‹αžαžΆαž€αŸ†αžŽαŸ‚αž“αŸαŸ‡αž‚αžΊαž˜αž·αž“αžŸαŸ’αžαž·αžαžŸαŸ’αžαŸαžšαž‘αŸαž αžΎαž™αž€αžΆαžšαž‘αžΆαž‰αž™αž€αž‡αžΆαž‘αŸ€αž„αž‘αžΆαžαŸ‹αž”αžšαžΆαž‡αŸαž™αž›αžΎαž’αžαŸ’αžαž”αž‘αž˜αž½αž™αž…αŸ†αž“αž½αž“αž’αŸ†αŸ”

async_v1.py

from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    """Download one Habr post page, parse it and save it as ``files/<i>.json``.

    Args:
        i: Numeric id of the post to download.

    Returns:
        ``1`` if the output file already exists (nothing is downloaded),
        ``2`` if the HTTP request raised, the HTTP status code if it is not
        200, and ``None`` after the article was written successfully.

    Side effects:
        Creates the ``files`` directory if needed and writes the JSON file;
        appends ids to ``req_errors.txt`` (request failures), ``Error503.txt``
        (503 responses) and ``Errors.txt`` (parse failures).
    """
    # os.path.join instead of the literal "files\{}.json": the backslash was
    # an invalid escape sequence and only worked as a path on Windows.
    current_file = os.path.join("files", "{}.json".format(i))

    if os.path.isfile(current_file):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try:
        r = requests.get(url)
    except requests.RequestException:
        # Append mode + text: the original opened the file read-only and
        # wrote an int, so the error handler itself raised.
        with open("req_errors.txt", "a") as file:
            file.write("{}\n".format(i))
        return 2

    # Record requests that the server blocked.
    if r.status_code == 503:
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
        logging.warning('{} / 503 Error'.format(i))

    # The post does not exist or was hidden.
    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()

        timestamp = soup.find(class_='tm-user-meta__date')['title']

        content = str(soup.find(id="post-content-body"))
        title = soup.find(class_="tm-article-title__text").get_text()
        tags = soup.find(class_="tm-article__tags").get_text()
        # Drops the first 5 characters of the tag text
        # (presumably a label prefix - TODO confirm against the page).
        tags = tags[5:]

        # Label marking the post as a translation or a tutorial.
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()

        rating = soup.find(class_="tm-votes-score").get_text()
    except Exception:
        # Best effort: unexpected markup yields an "Error" record instead of
        # aborting the whole thread pool.
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "ΠŸΡ€ΠΈ парсингС этой страницС ΠΏΡ€ΠΎΠΈΠ·ΠΎΡˆΠ»Π° ошибка."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Write the article to json.
    article = [i, timestamp, author, title, content, tm_tag, rating, tags]
    try:
        os.makedirs("files", exist_ok=True)
        with open(current_file, "w") as write_file:
            json.dump(article, write_file)
    except Exception:
        # Show which id failed before propagating the error.
        print(i)
        raise

if __name__ == '__main__':
    # Command-line entry point: async_v1.py <first id> <stop id>.
    if len(sys.argv) < 3:
        print("НСобходимы ΠΏΠ°Ρ€Π°ΠΌΠ΅Ρ‚Ρ€Ρ‹ min ΠΈ max. ИспользованиС: async_v1.py 1 100")
        sys.exit(1)
    # Named first_id/stop_id so the builtins min()/max() are not shadowed.
    first_id = int(sys.argv[1])
    stop_id = int(sys.argv[2])

    # With more than 3 threads Habr temporarily bans the IP address.
    pool = ThreadPool(3)

    # Start the timer and launch the threads.
    start_time = datetime.now()
    try:
        pool.map(worker, range(first_id, stop_id))
    finally:
        # Shut the pool down even if a worker raised.
        pool.close()
        pool.join()

    # Once all threads have finished, print the elapsed time.
    print(datetime.now() - start_time)

αž€αŸ†αžŽαŸ‚αž‘αžΈαž”αžΈαŸ” αž…αž»αž„αž€αŸ’αžšαŸ„αž™

αžαžŽαŸˆαž–αŸαž›αžŠαŸ‚αž›αž€αŸ†αž–αž»αž„αž”αŸ†αž”αžΆαžαŸ‹αž€αŸ†αž αž»αžŸαž€αŸ†αžŽαŸ‚αž‘αžΈαž–αžΈαžš αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αžšαž€αžƒαžΎαž‰αžαžΆ Habr αžŸαŸ’αžšαžΆαž”αŸ‹αžαŸ‚αž˜αžΆαž“ API αžŠαŸ‚αž›αžαŸ’αžšαžΌαžœαž”αžΆαž“αž…αžΌαž›αž”αŸ’αžšαžΎαžŠαŸ„αž™αž€αŸ†αžŽαŸ‚αž…αž›αŸαžαžšαž”αžŸαŸ‹αž‚αŸαž αž‘αŸ†αž–αŸαžšαŸ” αžœαžΆαž•αŸ’αž‘αž»αž€αž›αžΏαž“αž‡αžΆαž„αž€αŸ†αžŽαŸ‚αž…αž›αŸαžαž–αŸ’αžšαŸ„αŸ‡αžœαžΆαž‚αŸ’αžšαžΆαž“αŸ‹αžαŸ‚αž‡αžΆ json αžŠαŸ‚αž›αž˜αž·αž“αž…αžΆαŸ†αž”αžΆαž…αŸ‹αžαŸ’αžšαžΌαžœαž”αžΆαž“αž‰αŸ‚αž€αŸ” αž“αŸ…β€‹αž‘αžΈβ€‹αž”αž‰αŸ’αž…αž”αŸ‹ αžαŸ’αž‰αž»αŸ†β€‹αž”αžΆαž“β€‹αžŸαž˜αŸ’αžšαŸαž…β€‹αž…αž·αžαŸ’αžβ€‹αžŸαžšαžŸαŸαžšβ€‹αž’αž€αŸ’αžŸαžšβ€‹αžšαž”αžŸαŸ‹β€‹αžαŸ’αž‰αž»αŸ†β€‹αž˜αŸ’αžŠαž„β€‹αž‘αŸ€αžαŸ”

αžŠαžΌαž…αŸ’αž“αŸαŸ‡αžŠαŸ„αž™αž”αžΆαž“αžšαž€αžƒαžΎαž‰ αžαŸ†αžŽαž“αŸαŸ‡ API αž’αŸ’αž“αž€αž’αžΆαž…αž…αžΆαž”αŸ‹αž•αŸ’αžαžΎαž˜αž‰αŸ‚αž€αžœαžΆαž”αžΆαž“αŸ”

async_v2.py

import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    """Fetch one article from the Habr mobile JSON API and save selected fields.

    Args:
        i: Numeric id of the article to download.

    Returns:
        ``1`` if ``files/<i>.json`` already exists, ``2`` if the HTTP request
        raised, ``503`` if the server answered 503, otherwise ``None``
        (the file is written only when the response reports ``success``).

    Raises:
        ValueError: If the response body is not valid JSON.
        KeyError: If the decoded response lacks one of the expected fields.

    Side effects:
        Writes ``files/<i>.json``; appends the ids of failed requests to
        ``req_errors.txt``.
    """
    # os.path.join instead of the literal "files\{}.json": the backslash was
    # an invalid escape sequence and only worked as a path on Windows.
    current_file = os.path.join("files", "{}.json".format(i))

    if os.path.isfile(current_file):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
    except requests.RequestException:
        # Append mode + text: the original opened the file read-only and
        # wrote an int, so the error handler itself raised.
        with open("req_errors.txt", "a") as file:
            file.write("{}\n".format(i))
        return 2

    if r.status_code == 503:
        logging.critical("503 Error")
        return 503

    data = json.loads(r.text)

    if data['success']:
        article = data['data']['article']

        # Field order is the on-disk format that parser.py unpacks.
        record = (
            article['id'],
            article['is_tutorial'],
            article['time_published'],
            article['title'],
            article['text_html'],
            article['comments_count'],
            article['lang'],
            article['tags_string'],
            article['reading_count'],
            article['author']['login'],
            article['voting']['score'],
        )
        with open(current_file, "w") as write_file:
            json.dump(record, write_file)

if __name__ == '__main__':
    # Command-line entry point: <script> <first id> <stop id>.
    if len(sys.argv) < 3:
        print("НСобходимы ΠΏΠ°Ρ€Π°ΠΌΠ΅Ρ‚Ρ€Ρ‹ min ΠΈ max. ИспользованиС: asyc.py 1 100")
        sys.exit(1)
    # Named first_id/stop_id so the builtins min()/max() are not shadowed.
    first_id = int(sys.argv[1])
    stop_id = int(sys.argv[2])

    # With more than 3 threads Habr temporarily bans the IP address.
    pool = ThreadPool(3)

    # Start the timer and launch the threads.
    start_time = datetime.now()
    try:
        pool.map(worker, range(first_id, stop_id))
    finally:
        # Shut the pool down even if a worker raised.
        pool.close()
        pool.join()

    # Once all threads have finished, print the elapsed time.
    print(datetime.now() - start_time)

αžœαžΆαž˜αžΆαž“αžœαžΆαž›αžŠαŸ‚αž›αž‘αžΆαž€αŸ‹αž‘αž„αž“αžΉαž„αž‘αžΆαŸ†αž„αž’αžαŸ’αžαž”αž‘αžαŸ’αž›αž½αž“αžœαžΆαž•αŸ’αž‘αžΆαž›αŸ‹ αž“αž·αž„αž’αŸ’αž“αž€αž“αž·αž–αž“αŸ’αž’αžŠαŸ‚αž›αž”αžΆαž“αžŸαžšαžŸαŸαžšαžœαžΆαŸ”

API.png

Habr αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αž“αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž˜αž½αž™αŸ”

αžαŸ’αž‰αž»αŸ†αž˜αž·αž“αž”αžΆαž“αž”αŸ„αŸ‡αž…αŸ„αž› json αž–αŸαž‰αž›αŸαž‰αž“αŸƒαž’αžαŸ’αžαž”αž‘αž“αžΈαž˜αž½αž™αŸ—αž‘αŸ αž”αŸ‰αž»αž“αŸ’αžαŸ‚αž”αžΆαž“αžšαž€αŸ’αžŸαžΆαž‘αž»αž€αžαŸ‚αžœαžΆαž›αžŠαŸ‚αž›αžαŸ’αž‰αž»αŸ†αžαŸ’αžšαžΌαžœαž€αžΆαžšαŸ–

  • id
  • αž‚αžΊ_αž€αžΆαžšαž”αž„αŸ’αžšαŸ€αž“
  • time_αž”αŸ„αŸ‡αž•αŸ’αžŸαžΆαž™
  • αž–αžΆαž“αžšαž„αŸ’αžœαžΆαž“αŸ‹
  • αž˜αžΆαžαž·αž€αžΆ
  • αž˜αžαž·αž™αŸ„αž”αž›αŸ‹_αžšαžΆαž”αŸ‹
  • lang αž‚αžΊαž‡αžΆαž—αžΆαžŸαžΆαžŠαŸ‚αž›αž’αžαŸ’αžαž”αž‘αžαŸ’αžšαžΌαžœαž”αžΆαž“αžŸαžšαžŸαŸαžšαŸ” αžšαž αžΌαžαž˜αž€αžŠαž›αŸ‹αž–αŸαž›αž“αŸαŸ‡αžœαžΆαž˜αžΆαž“αžαŸ‚ en αž“αž·αž„ ru αž”αŸ‰αž»αžŽαŸ’αžŽαŸ„αŸ‡αŸ”
  • tags_string β€” αžŸαŸ’αž›αžΆαž€αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αž–αžΈαž”αŸ’αžšαž€αžΆαžŸ
  • αž€αžΆαžšαž’αžΆαž“_αžšαžΆαž”αŸ‹
  • αž’αŸ’αž“αž€β€‹αž“αž·αž–αž“αŸ’αž’
  • αž–αž·αž“αŸ’αž‘αž» - αž…αŸ†αžŽαžΆαžαŸ‹αžαŸ’αž“αžΆαž€αŸ‹αž’αžαŸ’αžαž”αž‘αŸ”

αžŠαžΌαž…αŸ’αž“αŸαŸ‡αžŠαŸ„αž™αž”αŸ’αžšαžΎ API αžαŸ’αž‰αž»αŸ†αž”αžΆαž“αž€αžΆαžαŸ‹αž”αž“αŸ’αžαž™αž–αŸαž›αžœαŸαž›αžΆαžŠαŸ†αžŽαžΎαžšαž€αžΆαžšαžŸαŸ’αž‚αŸ’αžšαžΈαž”αž˜αž€αžαŸ’αžšαžΉαž˜ 8 αžœαž·αž“αžΆαž‘αžΈαž€αŸ’αž“αž»αž„ 100 urlαŸ”

αž”αž“αŸ’αž‘αžΆαž”αŸ‹αž–αžΈαž™αžΎαž„αž‘αžΆαž‰αž™αž€αž‘αž·αž“αŸ’αž“αž“αŸαž™αžŠαŸ‚αž›αž™αžΎαž„αžαŸ’αžšαžΌαžœαž€αžΆαžšαž αžΎαž™ αž™αžΎαž„αžαŸ’αžšαžΌαžœαžŠαŸ†αžŽαžΎαžšαž€αžΆαžšαžœαžΆ αž αžΎαž™αž”αž‰αŸ’αž…αžΌαž›αžœαžΆαž‘αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αŸ” αž˜αž·αž“αž˜αžΆαž“αž”αž‰αŸ’αž αžΆαž‡αžΆαž˜αž½αž™αž“αŸαŸ‡αž‘αŸαŸ–

parser.py

import json
import sqlite3
import logging
from datetime import datetime

def parser(min, max):
    """Load saved article JSON files into the ``articles`` table of ``habr.db``.

    For every id in ``range(min, max)`` the file ``files/<id>.json`` is read.
    It must hold the 11-element sequence (id, is_tutorial, time_published,
    title, content, comments_count, lang, tags_string, reading_count, author,
    score), which is inserted as one row. Ids without a readable file are
    skipped.

    Args:
        min: First article id to import (inclusive). The name shadows the
            builtin but is kept for backward compatibility with callers.
        max: Article id to stop at (exclusive). Same note as for ``min``.

    Side effects:
        Creates/updates ``habr.db``. Rows inserted before an error are still
        committed, and the connection is always closed.
    """
    # Local import: the import block of this script does not include os.
    import os

    conn = sqlite3.connect('habr.db')
    try:
        c = conn.cursor()
        c.execute('PRAGMA encoding = "UTF-8"')
        # Disable write confirmation; this speeds writes up several-fold.
        c.execute('PRAGMA synchronous = 0')
        # Adjacent literals: a plain quoted string may not span two lines.
        c.execute(
            "CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, "
            "lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)"
        )
        try:
            for i in range(min, max):
                filename = os.path.join("files", "{}.json".format(i))
                try:
                    # `with` closes the handle even if json.load raises.
                    with open(filename) as f:
                        data = json.load(f)
                except IOError:
                    logging.info('FileNotExists')
                    continue

                (article_id, is_tutorial, time_published, title, content,
                 comments_count, lang, tags_string, reading_count, author,
                 score) = data

                # The file order differs from the column order; the values
                # are unpacked by name and re-ordered to keep the table
                # readable. Passing `data` directly would require matching
                # orders.
                c.execute(
                    'INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (article_id, time_published, author, title, content, lang,
                     comments_count, reading_count, score, is_tutorial,
                     tags_string),
                )
        finally:
            conn.commit()
    finally:
        conn.close()

if __name__ == '__main__':
    # Import the downloaded files for ids 490000 .. 490917 and report how
    # long it took; guarded so importing the module does not touch the
    # database.
    start_time = datetime.now()
    parser(490000, 490918)
    print(datetime.now() - start_time)

αžŸαŸ’αžαž·αžαž·αŸ”

αž‡αžΆαž”αŸ’αžšαž–αŸƒαžŽαžΈ αž‡αžΆαž…αž»αž„αž€αŸ’αžšαŸ„αž™ αž’αŸ’αž“αž€αž’αžΆαž…αž‘αžΆαž‰αž™αž€αžŸαŸ’αžαž·αžαž·αž˜αž½αž™αž…αŸ†αž“αž½αž“αž–αžΈαž‘αž·αž“αŸ’αž“αž“αŸαž™αŸ–

  • αž€αŸ’αž“αž»αž„αž…αŸ†αžŽαŸ„αž˜ 490 αžŠαŸ‚αž›αžšαŸ†αž–αžΉαž„αž‘αž»αž€ αž˜αžΆαž“αžαŸ‚ 406 αž’αžαŸ’αžαž”αž‘αž”αŸ‰αž»αžŽαŸ’αžŽαŸ„αŸ‡αžŠαŸ‚αž›αžαŸ’αžšαžΌαžœαž”αžΆαž“αž‘αžΆαž‰αž™αž€αŸ” αžœαžΆαž”αŸ’αžšαŸ‚αžαžΆαž…αŸ’αžšαžΎαž“αž‡αžΆαž„αž–αžΆαž€αŸ‹αž€αžŽαŸ’αžαžΆαž› (228) αž“αŸƒαž’αžαŸ’αžαž”αž‘αž“αŸ…αž›αžΎ Habre αžαŸ’αžšαžΌαžœαž”αžΆαž“αž›αžΆαž€αŸ‹αž¬αž›αž»αž”αŸ”
  • αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž‘αžΆαŸ†αž„αž˜αžΌαž›αžŠαŸ‚αž›αž˜αžΆαž“αž’αžαŸ’αžαž”αž‘αž‡αž·αžαž€αž“αŸ’αž›αŸ‡αž›αžΆαž“αž˜αžΆαž“αž‘αž˜αŸ’αž„αž“αŸ‹ 2.95 GB αŸ” αž“αŸ…αž€αŸ’αž“αž»αž„αžŸαŸ†αžŽαž»αŸ†αž”αŸ‚αž”αž”αž‘αžŠαŸ‚αž›αž”αžΆαž“αž”αž„αŸ’αž αžΆαž”αŸ‹ - 495 αž˜αŸαž€αžΆαž”αŸƒαŸ”
  • αžŸαžšαž»αž”αž˜αž€ αž˜αžΆαž“αž’αŸ’αž“αž€αž“αž·αž–αž“αŸ’αž’αž…αŸ†αž“αž½αž“ ៣៧.៨០ៀ αž“αžΆαž€αŸ‹αž“αŸ…αž›αžΎ Habre αŸ” αžαŸ’αž‰αž»αŸ†αžŸαžΌαž˜αžšαŸ†αž›αžΉαž€αž’αŸ’αž“αž€αžαžΆ αž‘αžΆαŸ†αž„αž“αŸαŸ‡αž‚αŸ’αžšαžΆαž“αŸ‹αžαŸ‚αž‡αžΆαžŸαŸ’αžαž·αžαž·αž–αžΈαž€αžΆαžšαž•αŸ’αžŸαžΆαž™αž”αž“αŸ’αžαž•αŸ’αž‘αžΆαž›αŸ‹αž”αŸ‰αž»αžŽαŸ’αžŽαŸ„αŸ‡αŸ”
  • αž’αŸ’αž“αž€αž“αž·αž–αž“αŸ’αž’αžŠαŸ‚αž›αž˜αžΆαž“αž•αž›αž·αžαž—αžΆαž–αž”αŸ†αž•αž»αžαž“αŸ…αž›αžΎ Habre - αž’αžΆαž›αžΈαž αŸ’αžŸαžΆ - ៨៧៧ៀ αž’αžαŸ’αžαž”αž‘αŸ”
  • αž’αžαŸ’αžαž”αž‘β€‹αžŠαŸ‚αž›β€‹αž”αžΆαž“β€‹αžœαžΆαž™β€‹αžαž˜αŸ’αž›αŸƒβ€‹αžαŸ’αž–αžŸαŸ‹ - 1448 αž”αžΌαž€
  • αž’αžαŸ’αžαž”αž‘αžŠαŸ‚αž›αž’αžΆαž“αž—αžΆαž‚αž…αŸ’αžšαžΎαž“ - 1660841 αž‘αžŸαŸ’αžŸαž“αŸˆ
  • αž—αžΆαž‚αž…αŸ’αžšαžΎαž“αž“αž·αž™αžΆαž™αž’αŸ†αž–αžΈαž’αžαŸ’αžαž”αž‘ - 2444 αž™αŸ„αž”αž›αŸ‹

αž‡αžΆαž€αžΆαžšαž”αŸ’αžšαžŸαžΎαžšαžŽαžΆαžŸαŸ‹, αž“αŸ…αž€αŸ’αž“αž»αž„αžŸαŸ†αžŽαž»αŸ†αž”αŸ‚αž”αž”αž‘αž“αŸƒαž€αŸ†αž–αžΌαž›αž’αŸ’αž“αž€αž“αž·αž–αž“αŸ’αž’ 15 αž€αŸ†αž–αžΌαž›Habr αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αž“αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž˜αž½αž™αŸ”
αž…αŸ†αžŽαžΆαžαŸ‹αžαŸ’αž“αžΆαž€αŸ‹αž€αŸ†αž–αžΌαž›αž‘αžΆαŸ†αž„ 15 αžαžΆαž˜αž…αŸ†αžŽαžΆαžαŸ‹αžαŸ’αž“αžΆαž€αŸ‹Habr αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αž“αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž˜αž½αž™αŸ”
αž€αŸ†αž–αžΌαž› 15 αž’αžΆαž“Habr αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αž“αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž˜αž½αž™αŸ”
αž€αŸ†αž–αžΌαž› 15 αž”αžΆαž“αž–αž·αž—αžΆαž€αŸ’αžŸαžΆHabr αž‘αžΆαŸ†αž„αž’αžŸαŸ‹αž“αŸ…αž€αŸ’αž“αž»αž„αž˜αžΌαž›αžŠαŸ’αž‹αžΆαž“αž‘αž·αž“αŸ’αž“αž“αŸαž™αž˜αž½αž™αŸ”

αž”αŸ’αžšαž—αž–: www.habr.com

αž”αž“αŸ’αžαŸ‚αž˜αž˜αžαž·αž™αŸ„αž”αž›αŸ‹