ื›ืœ Habr ื‘ืžืกื“ ื ืชื•ื ื™ื ืื—ื“

ืื—ืจ ืฆื”ืจื™ื™ื ื˜ื•ื‘ื™ื. ืขื‘ืจื• ืฉื ืชื™ื™ื ืžืื– ืฉื ื›ืชื‘ ื”ืžืืžืจ ื”ืื—ืจื•ืŸ ืขืœ ื ื™ืชื•ื— Habr, ื•ืžืื– ื›ืžื” ื“ื‘ืจื™ื ื”ืฉืชื ื•.

ื›ืฉืจืฆื™ืชื™ ืœืงื‘ืœ ืขื•ืชืง ืฉืœ Habr, ื”ื—ืœื˜ืชื™ ืœื›ืชื•ื‘ ืžื ืชื— ืฉื™ืฉืžื•ืจ ืืช ื›ืœ ื”ืชื•ื›ืŸ ืฉืœ ื”ืžื—ื‘ืจื™ื ืœืžืื’ืจ. ืื™ืš ื–ื” ืงืจื” ื•ื‘ืื™ืœื• ืฉื’ื™ืื•ืช ื ืชืงืœืชื™ - ืชื•ื›ืœื• ืœืงืจื•ื ืžืชื—ืช ืœื—ืชื•ืš.

TLDR- ืงื™ืฉื•ืจ ืœืžืกื“ ื”ื ืชื•ื ื™ื

ื”ื’ืจืกื” ื”ืจืืฉื•ื ื” ืฉืœ ื”ืžื ืชื—. ืฉืจืฉื•ืจ ืื—ื“, ื”ืจื‘ื” ื‘ืขื™ื•ืช

ืžืœื›ืชื—ื™ืœื”, ื”ื—ืœื˜ืชื™ ืœื”ื›ื™ืŸ ืื‘ ื˜ื™ืคื•ืก ืฉืœ ืกืงืจื™ืคื˜ ืฉื‘ื• ื”ืžืืžืจ ื™ื ื•ืชื— ื•ื™ื™ื›ื ืก ืœืžืกื“ ื”ื ืชื•ื ื™ื ืžื™ื“ ืขื ื”ื”ื•ืจื“ื”. ื‘ืœื™ ืœื—ืฉื•ื‘ ืคืขืžื™ื™ื, ื”ืฉืชืžืฉืชื™ ื‘-sqlite3, ื›ื™ ื–ื” ื”ื™ื” ืคื—ื•ืช ืขืชื™ืจ ืขื‘ื•ื“ื”: ืื™ืŸ ืฆื•ืจืš ื‘ืฉืจืช ืžืงื•ืžื™, ื”ืžืกื“ ื ื•ืฆืจ-ื ื‘ื“ืง-ื ืžื—ืง ื‘ืงืœื•ืช, ื•ื›ืŸ ื”ืœืื”.

one_thread.py

from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime

def main(min, max):
    """Download Habr posts min..max-1, parse them and store them in habr.db.

    Single-threaded prototype: every post is fetched with requests, parsed
    with BeautifulSoup and inserted into sqlite; one commit for the whole run.
    Request failures are appended to req_errors.txt, non-200 responses are
    skipped, and parse failures insert an "Error" placeholder row.
    """
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content  TEXT, tags TEXT)")

    start_time = datetime.now()
    try:
        for i in range(min, max):
            url = "https://m.habr.com/post/{}".format(i)
            try:
                r = requests.get(url)
            except requests.exceptions.RequestException:
                # BUG FIX: the original opened the file read-only and passed
                # an int to write(); append a str line instead.
                with open("req_errors.txt", "a") as file:
                    file.write(str(i) + "\n")
                continue
            if r.status_code != 200:
                print("{} - {}".format(i, r.status_code))
                continue

            soup = BeautifulSoup(r.text, 'html.parser')

            try:
                author = soup.find(class_="tm-user-info__username").get_text()
                content = str(soup.find(id="post-content-body"))
                title = soup.find(class_="tm-article-title__text").get_text()
                # Strip the leading "Tags:" label (first 5 characters).
                tags = soup.find(class_="tm-article__tags").get_text()[5:]
            except (AttributeError, TypeError):
                # A selector returned None - the page layout did not match.
                author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
                content = "ะŸั€ะธ ะฟะฐั€ัะธะฝะณะต ัั‚ะพะน ัั‚ั€ะฐะฝะธั†ะต ะฟั€ะพะธะทะพัˆะปะฐ ะพัˆะธะฑะบะฐ."

            c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)',
                      (i, author, title, content, tags))
            print(i)
    finally:
        # One commit for the whole run (interrupting the loop still commits
        # whatever was inserted so far); always release the connection.
        conn.commit()
        conn.close()
    print(datetime.now() - start_time)

main(1, 490406)

ื”ื›ืœ ืงืœืืกื™ - ืื ื—ื ื• ืžืฉืชืžืฉื™ื ื‘ืžืจืง ื™ืคื”, ื‘ืงืฉื•ืช ื•ืื‘ ื˜ื™ืคื•ืก ืžื”ื™ืจ ืžื•ื›ืŸ. ื–ื” ืคืฉื•ื˜โ€ฆ

  • ื”ื•ืจื“ืช ื”ืขืžื•ื“ ื ืžืฆืืช ื‘ืฉืจืฉื•ืจ ืื—ื“

  • ืื ืชืคืจื™ืข ืœื‘ื™ืฆื•ืข ื”ืกืงืจื™ืคื˜, ื›ืœ ืžืกื“ ื”ื ืชื•ื ื™ื ืœื ื™ืœืš ืœืฉื•ื ืžืงื•ื. ืื—ืจื™ ื”ื›ืœ, ื”-commit ืžืชื‘ืฆืข ืจืง ืœืื—ืจ ื›ืœ ื”ื ื™ืชื•ื—.
    ื›ืžื•ื‘ืŸ ืฉื ื™ืชืŸ ืœื‘ืฆืข ืฉื™ื ื•ื™ื™ื ื‘ื‘ืกื™ืก ื”ื ืชื•ื ื™ื ืœืื—ืจ ื›ืœ ื”ื›ื ืกื”, ืื‘ืœ ืื– ื–ืžืŸ ื‘ื™ืฆื•ืข ื”ืกืงืจื™ืคื˜ ื™ื’ื“ืœ ืžืฉืžืขื•ืชื™ืช.

  • ื ื™ืชื•ื— 100,000 ื”ืžืืžืจื™ื ื”ืจืืฉื•ื ื™ื ื”ื™ื” ืื•ืจืš ืขื‘ื•ืจื™ ืขืฉืจื•ืช ืฉืขื•ืช.

ื‘ืฉืœื‘ ื”ื‘ื ืื ื™ ืžื•ืฆื ืืช ื”ืžืืžืจ ืฉืœ ื”ืžืฉืชืžืฉ ืžืฉื•ืœื‘ืช, ืฉืงืจืืชื™ ื•ืžืฆืืชื™ ื›ืžื” ืคืจื™ืฆื•ืช ืœื—ื™ื™ื ื›ื“ื™ ืœื”ืื™ืฅ ืืช ื”ืชื”ืœื™ืš ื”ื–ื”:

  • ื”ืฉื™ืžื•ืฉ ื‘-multithreading ืžืื™ืฅ ืืช ื”ื”ื•ืจื“ื” ืคื™ ื›ืžื” ื•ื›ืžื”.
  • ืื™ืŸ ืฆื•ืจืš ืœื”ื•ืจื™ื“ ืืช ื”ื’ืจืกื” ื”ืžืœืื” ืฉืœ Habr - ืืคืฉืจ ืœื”ืกืชืคืง ื‘ื’ืจืกื” ื”ื ื™ื™ื“ืช ืฉืœื•.
    ืœื“ื•ื’ืžื”, ืื ืžืืžืจ ื‘ื’ืจืกืช ืฉื•ืœื—ืŸ ื”ืขื‘ื•ื“ื” ืฉื•ืงืœ 378 ืงื™ืœื•-ื‘ื™ื™ื˜, ื”ืจื™ ืฉื‘ื’ืจืกื” ื”ื ื™ื™ื“ืช ื”ื•ื ืฉื•ืงืœ ืจืง 126 ืงื™ืœื•-ื‘ื™ื™ื˜.

ื’ืจืกื” ืฉื ื™ื”. ื—ื•ื˜ื™ื ืจื‘ื™ื, ืื™ืกื•ืจ ื–ืžื ื™ ืžื”ื‘ืจ

ื›ืฉื—ื˜ื˜ืชื™ ื‘ืื™ื ื˜ืจื ื˜ ื‘ื ื•ืฉื ืฉืœ ืจื™ื‘ื•ื™ ืฉืจืฉื•ืจื™ื ื‘ืคื™ืชื•ืŸ, ื‘ื—ืจืชื™ ื‘ืืคืฉืจื•ืช ื”ืคืฉื•ื˜ื” ื‘ื™ื•ืชืจ ืขื multiprocessing.dummy, ืฉืžืชื™ ืœื‘ ืฉื”ื•ืคื™ืขื• ื‘ืขื™ื•ืช ื™ื—ื“ ืขื ืจื™ื‘ื•ื™ ื”ืฉืจืฉื•ืจื™ื.

SQLite3 ืœื ืจื•ืฆื” ืœืขื‘ื•ื“ ืขื ื™ื•ืชืจ ืžืฉืจืฉื•ืจ ืื—ื“.
ื–ื” ื ืคืชืจ ื‘ืขื–ืจืช check_same_thread=False, ืื‘ืœ ืฉื’ื™ืื” ื–ื• ืื™ื ื” ื”ื™ื—ื™ื“ื”: ื‘ื ื™ืกื™ื•ืŸ ืœื”ื›ื ื™ืก ืœืžืกื“ ื”ื ืชื•ื ื™ื ืžืชืจื—ืฉื•ืช ืœืคืขืžื™ื ืฉื’ื™ืื•ืช ืฉืœื ื”ืฆืœื—ืชื™ ืœืคืชื•ืจ.

ืœื›ืŸ, ืื ื™ ืžื—ืœื™ื˜ ืœื ื˜ื•ืฉ ืืช ื”ื”ื›ื ืกื” ื”ืžื™ื™ื“ื™ืช ืฉืœ ืžืืžืจื™ื ื™ืฉื™ืจื•ืช ืœืžืกื“ ื”ื ืชื•ื ื™ื, ื•ื‘ื–ื›ืจืชื™ ืืช ื”ืคืชืจื•ืŸ ื”ืžืฉื•ืœื‘, ืื ื™ ืžื—ืœื™ื˜ ืœื”ืฉืชืžืฉ ื‘ืงื‘ืฆื™ื, ื›ื™ ืื™ืŸ ื‘ืขื™ื•ืช ืขื ื›ืชื™ื‘ื” ืžืจื•ื‘ื” ื—ื•ื˜ื™ื ืœืงื•ื‘ืฅ.

Habr ืžืชื—ื™ืœ ืœืืกื•ืจ ืขืœ ืฉื™ืžื•ืฉ ื‘ื™ื•ืชืจ ืžืฉืœื•ืฉื” ืฉืจืฉื•ืจื™ื.
ื ื™ืกื™ื•ื ื•ืช ื ืœื”ื‘ื™ื ืžื“ื™ ืœื’ืฉืช ืœ-Habr ืขืœื•ืœื™ื ืœื”ืกืชื™ื™ื ื‘ื—ืกื™ืžืช IP ืœืžืฉืš ื›ืžื” ืฉืขื•ืช. ืœื›ืŸ ื™ืฉ ืœื”ืฉืชืžืฉ ื‘-3 ืฉืจืฉื•ืจื™ื ื‘ืœื‘ื“, ืื‘ืœ ื’ื ื–ื” ื›ื‘ืจ ืฉื™ืคื•ืจ ื˜ื•ื‘, ืžื›ื™ื•ื•ืŸ ืฉื”ื–ืžืŸ ืœืขื‘ื•ืจ ืขืœ 100 ืžืืžืจื™ื ืžืฆื˜ืžืฆื ืž-26 ืœ-12 ืฉื ื™ื•ืช.

ืจืื•ื™ ืœืฆื™ื™ืŸ ื›ื™ ื’ืจืกื” ื–ื• ืื™ื ื” ื™ืฆื™ื‘ื” ืœืžื“ื™, ื•ื”ื”ื•ืจื“ื•ืช ื ื•ืฉืจื•ืช ืžืขืช ืœืขืช ืขืœ ืžืกืคืจ ืจื‘ ืฉืœ ืžืืžืจื™ื.

async_v1.py

from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    """Download post *i* from the mobile Habr site, parse it and dump the
    result to files/<i>.json.

    Returns 1 if the file already exists, 2 on a request error, the HTTP
    status code for non-200 responses, and None on success.
    """
    currentFile = os.path.join("files", "{}.json".format(i))

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        # BUG FIX: the original opened the file read-only and passed an int
        # to write(); append a str line instead.
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    # Record requests the server refused (temporary throttling/ban).
    if r.status_code == 503:
        with open("Error503.txt", "a") as write_file:
            # BUG FIX: the original wrote a literal "n" instead of a newline.
            write_file.write(str(i) + "\n")
            logging.warning('{} / 503 Error'.format(i))

    # The post does not exist or was hidden.
    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    soup = BeautifulSoup(r.text, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()

        timestamp = soup.find(class_='tm-user-meta__date')['title']

        content = str(soup.find(id="post-content-body"))
        title = soup.find(class_="tm-article-title__text").get_text()
        # Strip the leading "Tags:" label (first 5 characters).
        tags = soup.find(class_="tm-article__tags").get_text()[5:]

        # Badge marking the post as a translation or a tutorial.
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()

        rating = soup.find(class_="tm-votes-score").get_text()
    except (AttributeError, KeyError, TypeError):
        # A selector found nothing - the page layout did not match.
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "ะŸั€ะธ ะฟะฐั€ัะธะฝะณะต ัั‚ะพะน ัั‚ั€ะฐะฝะธั†ะต ะฟั€ะพะธะทะพัˆะปะฐ ะพัˆะธะฑะบะฐ."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Persist the parsed article as json.
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except Exception:
        # Log which id failed, then let the error propagate.
        print(i)
        raise

if __name__ == '__main__':
    # Usage: async_v1.py <min> <max>
    if len(sys.argv) < 3:
        print("ะะตะพะฑั…ะพะดะธะผั‹ ะฟะฐั€ะฐะผะตั‚ั€ั‹ min ะธ max. ะ˜ัะฟะพะปัŒะทะพะฒะฐะฝะธะต: async_v1.py 1 100")
        sys.exit(1)

    first_id, last_id = int(sys.argv[1]), int(sys.argv[2])

    # More than 3 threads gets the IP temporarily banned by Habr.
    pool = ThreadPool(3)

    # Time the whole run while the pool walks over the id range.
    start_time = datetime.now()
    results = pool.map(worker, range(first_id, last_id))

    pool.close()
    pool.join()

    # Print the elapsed time once every thread has finished.
    print(datetime.now() - start_time)

ื’ืจืกื” ืฉืœื™ืฉื™ืช. ืกื•ืคื™

ืชื•ืš ื›ื“ื™ ื ื™ืคื•ื™ ื‘ืื’ื™ื ื‘ื’ืจืกื” ื”ืฉื ื™ื™ื”, ื’ื™ืœื™ืชื™ ืฉืœ-Habr, ืคืชืื•ื, ื™ืฉ API ืฉื”ื’ืจืกื” ื”ื ื™ื™ื“ืช ืฉืœ ื”ืืชืจ ื ื™ื’ืฉืช ืืœื™ื•. ื”ื•ื ื ื˜ืขืŸ ืžื”ืจ ื™ื•ืชืจ ืžื”ื’ืจืกื” ื”ื ื™ื™ื“ืช, ืžื›ื™ื•ื•ืŸ ืฉื–ื” ืจืง json, ืฉืืคื™ืœื• ืœื ืฆืจื™ืš ืœื ืชื—. ื‘ืกื•ืคื• ืฉืœ ื“ื‘ืจ, ื”ื—ืœื˜ืชื™ ืœืฉื›ืชื‘ ืืช ื”ืชืกืจื™ื˜ ืฉืœื™ ืฉื•ื‘.

ืื–, ืœืื—ืจ ืฉืžืฆืืชื™ ืงื™ืฉื•ืจ ื–ื” API, ืืชื” ื™ื›ื•ืœ ืœื”ืชื—ื™ืœ ืœื ืชื— ืื•ืชื•.

async_v2.py

import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging

def worker(i):
    """Fetch article *i* via the mobile Habr JSON API and save the fields we
    need to files/<i>.json.

    Returns 1 if the file already exists, 503 if the server throttled us,
    2 on any other request error, and None otherwise.
    """
    currentFile = os.path.join("files", "{}.json".format(i))

    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except requests.exceptions.RequestException:
        # BUG FIX: the original opened the file read-only and passed an int
        # to write(); append a str line instead.
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    data = r.json()

    if data['success']:
        article = data['data']['article']

        # Keep only the fields we need, in the order parser.py unpacks them.
        # (Renamed locals so the builtin `id` is not shadowed.)
        record = (
            article['id'],
            article['is_tutorial'],
            article['time_published'],
            article['title'],
            article['text_html'],
            article['comments_count'],
            article['lang'],
            article['tags_string'],
            article['reading_count'],
            article['author']['login'],
            article['voting']['score'],
        )
        with open(currentFile, "w") as write_file:
            json.dump(record, write_file)

if __name__ == '__main__':
    # Usage: async_v2.py <min> <max>
    if len(sys.argv) < 3:
        print("ะะตะพะฑั…ะพะดะธะผั‹ ะฟะฐั€ะฐะผะตั‚ั€ั‹ min ะธ max. ะ˜ัะฟะพะปัŒะทะพะฒะฐะฝะธะต: asyc.py 1 100")
        sys.exit(1)

    first_id, last_id = int(sys.argv[1]), int(sys.argv[2])

    # More than 3 threads gets the IP temporarily banned by Habr.
    pool = ThreadPool(3)

    # Time the whole run while the pool walks over the id range.
    start_time = datetime.now()
    results = pool.map(worker, range(first_id, last_id))

    pool.close()
    pool.join()

    # Print the elapsed time once every thread has finished.
    print(datetime.now() - start_time)

ื”ื•ื ืžื›ื™ืœ ืฉื“ื•ืช ื”ืงืฉื•ืจื™ื ื”ืŸ ืœืžืืžืจ ืขืฆืžื• ื•ื”ืŸ ืœืžื—ื‘ืจ ืฉื›ืชื‘ ืื•ืชื•.

API.png

ื›ืœ Habr ื‘ืžืกื“ ื ืชื•ื ื™ื ืื—ื“

ืœื ื–ืจืงืชื™ ืืช ื”-json ื”ืžืœื ืฉืœ ื›ืœ ืžืืžืจ, ืืœื ืฉืžืจืชื™ ืจืง ืืช ื”ืฉื“ื•ืช ืฉื”ื™ื™ืชื™ ืฆืจื™ืš:

  • id
  • is_tutorial
  • ื–ืžืŸ_ืคื•ืจืกื
  • ื›ื•ืชืจืช
  • ืชื•ื›ืŸ
  • ืกืคื™ืจืช_ืชื’ื•ื‘ื•ืช
  • lang ื”ื™ื ื”ืฉืคื” ื‘ื” ื ื›ืชื‘ ื”ืžืืžืจ. ืขื“ ื›ื”, ื™ืฉ ืœื• ืจืง en ื•-ru.
  • tags_string - ื›ืœ ื”ืชื’ื™ื ืžื”ืคื•ืกื˜
  • ืกืคื™ืจืช_ืงืจื™ืื”
  • ืžื—ื‘ืจ
  • ืฆื™ื•ืŸ - ื“ื™ืจื•ื’ ืžืืžืจ.

ืœืคื™ื›ืš, ื‘ืืžืฆืขื•ืช ื”-API, ืฆืžืฆืžืชื™ ืืช ื–ืžืŸ ื‘ื™ืฆื•ืข ื”ืกืงืจื™ืคื˜ ืœ-8 ืฉื ื™ื•ืช ืœื›ืœ 100 url.

ืœืื—ืจ ืฉื”ื•ืจื“ื ื• ืืช ื”ื ืชื•ื ื™ื ืฉืื ื• ืฆืจื™ื›ื™ื, ืขืœื™ื ื• ืœืขื‘ื“ ืื•ืชื ื•ืœื”ื–ื™ืŸ ืื•ืชื ืœืžืกื“ ื”ื ืชื•ื ื™ื. ื’ื ืขื ื–ื” ืœื ื”ื™ื• ืœื™ ื‘ืขื™ื•ืช:

parser.py

import json
import sqlite3
import logging
from datetime import datetime

def parser(min, max):
    """Load articles min..max-1 from files/<i>.json into habr.db.

    Missing files are skipped with a log message; everything read is
    committed in a single transaction at the end (also on error, via
    finally), and the connection is always closed.
    """
    import os  # local import: parser.py does not import os at module level

    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    # Disable write confirmation: speeds inserts up severalfold.
    c.execute('PRAGMA synchronous = 0')
    # BUG FIX: the original CREATE TABLE string was broken across two source
    # lines without continuation, which is a SyntaxError.
    c.execute("CREATE TABLE IF NOT EXISTS articles("
              "id INTEGER, time_published TEXT, author TEXT, title TEXT, "
              "content TEXT, lang TEXT, comments_count INTEGER, "
              "reading_count INTEGER, score INTEGER, is_tutorial INTEGER, "
              "tags_string TEXT)")
    try:
        for i in range(min, max):
            filename = os.path.join("files", "{}.json".format(i))
            try:
                # `with` guarantees the file handle is released even when an
                # exception interrupts the loop (the original leaked it).
                with open(filename) as f:
                    data = json.load(f)
            except IOError:
                logging.info('FileNotExists')
                continue

            # Tuple order matches what async_v2.py wrote to the json file.
            (id_, is_tutorial, time_published, title, content, comments_count,
             lang, tags_string, reading_count, author, score) = data

            c.execute(
                'INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                (id_, time_published, author, title, content, lang,
                 comments_count, reading_count, score, is_tutorial,
                 tags_string))
    finally:
        conn.commit()
        conn.close()

# Time the import of the downloaded range into the database.
t0 = datetime.now()
parser(490000, 490918)
print(datetime.now() - t0)

ืกื˜ื˜ื™ืกื˜ื™ืงื”

ื•ื‘ื›ืŸ, ื‘ืื•ืคืŸ ืžืกื•ืจืชื™, ืœื‘ืกื•ืฃ, ืืชื” ื™ื›ื•ืœ ืœื—ืœืฅ ื›ืžื” ื ืชื•ื ื™ื ืกื˜ื˜ื™ืกื˜ื™ื™ื ืžื”ื ืชื•ื ื™ื:

  • ืžืชื•ืš 490,406 ื”ื”ื•ืจื“ื•ืช ื”ืฆืคื•ื™ื•ืช ื”ื•ืจื“ื• ืจืง ื›-228 ืืœืฃ ืžืืžืจื™ื. ืžืกืชื‘ืจ ืฉื™ื•ืชืจ ืžืžื—ืฆื™ืช ืžื”ืžืืžืจื™ื ืขืœ Habr ื”ื•ืกืชืจื• ืื• ื ืžื—ืงื•.
  • ื”ืžืื’ืจ ื›ื•ืœื•, ื”ืžื•ืจื›ื‘ ืžื›ืžืขื˜ ื—ืฆื™ ืžื™ืœื™ื•ืŸ ืžืืžืจื™ื, ืฉื•ืงืœ 2.95 ื’'ื™ื’ื”-ื‘ื™ื™ื˜. ื‘ืฆื•ืจื” ื“ื—ื•ืกื” - 495 ืžื’ื”.
  • ื‘ืกืš ื”ื›ืœ ื™ืฉ 37,804 ืžื—ื‘ืจื™ื ื‘-Habr. ื™ืฉ ืœื–ื›ื•ืจ ืฉืกื˜ื˜ื™ืกื˜ื™ืงื” ื–ื• ืžื‘ื•ืกืกืช ืจืง ืขืœ ืคื•ืกื˜ื™ื ื—ื™ื™ื (ืฉืœื ื ืžื—ืงื•).
  • ื”ืžื—ื‘ืจ ื”ืคื•ืจื” ื‘ื™ื•ืชืจ ื‘-Habr ื”ื•ื alizar - 8,774 ืžืืžืจื™ื.
  • ืžืืžืจ ื‘ืขืœ ื“ื™ืจื•ื’ ื’ื‘ื•ื” - 1448 ืคืœื•ืกื™ื
  • ื”ืžืืžืจ ื”ื ืงืจื ื‘ื™ื•ืชืจ โ€“ 1660841 ืฆืคื™ื•ืช
  • ื”ืžืืžืจ ื”ื ื“ื•ืŸ ื‘ื™ื•ืชืจ - 2444 ื”ืขืจื•ืช

ื•ื‘ื›ืŸ, ื‘ืฆื•ืจืช ื˜ื‘ืœืื•ืช ืžืกื›ืžื•ืช:
  • 15 ื”ืžื—ื‘ืจื™ื ื”ืžื•ื‘ื™ืœื™ื
  • 15 ื”ืžืืžืจื™ื ื”ืžื•ื‘ื™ืœื™ื ืœืคื™ ื“ื™ืจื•ื’
  • 15 ื”ืžืืžืจื™ื ื”ื ืงืจืื™ื ื‘ื™ื•ืชืจ
  • 15 ื”ืžืืžืจื™ื ื”ื ื“ื•ื ื™ื ื‘ื™ื•ืชืจ

ืžืงื•ืจ: www.habr.com

ื”ื•ืกืคืช ืชื’ื•ื‘ื”