α’αα»ααα½ααααΈα ααΆααΆααααααα 2 ααααΆαα αΎαα
αΆααααΆααααΈαααα»ααααααααΆα
αα αααααααααα»αα ααααΆαα αααΆααα αααααα Habr αααα»αααΆααααααα α α·αααααααα parser αααααΉααααααΆαα»αααΆαα·ααΆααΆααα’ααααααα’ααααα·αααααα αααα»αααΌαααααΆααα·ααααααα αααααααααΆααΎαα‘αΎααα·αααα α»αα’αααΈααααααα»αααΆααα½αααααα - α’αααα’αΆα α’αΆααα αααααααΆαααΆααα
TL;DR: a link to the database.
To begin with, I decided that everything would be parsed and put straight into a database, sqlite3 to be exact. I did not think about it for long: it takes the least effort, there is no local server to run, and you can create, inspect and delete it as you please.
one_thread.py
from bs4 import BeautifulSoup
import sqlite3
import requests
from datetime import datetime
def main(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")

    start_time = datetime.now()
    c.execute("begin")
    for i in range(min, max):
        url = "https://m.habr.com/post/{}".format(i)
        try:
            r = requests.get(url)
        except:
            # Log the ids of requests that failed at the network level
            with open("req_errors.txt", "a") as file:
                file.write(str(i) + "\n")
            continue
        if r.status_code != 200:
            print("{} - {}".format(i, r.status_code))
            continue

        html_doc = r.text
        soup = BeautifulSoup(html_doc, 'html.parser')
        try:
            author = soup.find(class_="tm-user-info__username").get_text()
            content = soup.find(id="post-content-body")
            content = str(content)
            title = soup.find(class_="tm-article-title__text").get_text()
            tags = soup.find(class_="tm-article__tags").get_text()
            tags = tags[5:]
        except:
            author, title, tags = "Error", "Error {}".format(r.status_code), "Error"
            content = "An error occurred while parsing this page."
        c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', (i, author, title, content, tags))
        print(i)
    c.execute("commit")
    print(datetime.now() - start_time)
main(1, 490406)
Everything works in the classic way, but there are a few things I do not like:
- Downloading is slow.
- All writes are wrapped in one big transaction, so even if nothing is wrong with your machine, a dropped connection means starting over from the beginning. Ideally you could commit after every insert instead, but then the run time grows considerably (see the sketch below).
Parsing the first 100 articles took about 000 seconds.
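To make the trade-off concrete, here is a minimal sketch of the two commit strategies with sqlite3. The rows list is a hypothetical stand-in for already-parsed articles and is not part of the original script.

import sqlite3

conn = sqlite3.connect('habr.db')
c = conn.cursor()
c.execute("CREATE TABLE IF NOT EXISTS habr(id INT, author VARCHAR(255), title VARCHAR(255), content TEXT, tags TEXT)")
rows = [(1, "author", "title", "<p>content</p>", "tags")]  # hypothetical parsed articles

# Variant 1: one transaction around the whole run. Fast, but if the connection
# or the machine dies mid-way, everything since "begin" is lost.
c.execute("begin")
for row in rows:
    c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
c.execute("commit")

# Variant 2: commit after every insert. Each article is durable immediately,
# at the cost of a noticeably longer total run time.
for row in rows:
    c.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
    conn.commit()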
This raised the questions:
- Will multithreading speed things up?
- How do I end up with not just a piece of Habr in the database, but all of it?
For example, the very same article that weighs 378 KB in the full version weighs only 126 KB in the mobile one, which is why the parser requests m.habr.com.
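A quick way to check this yourself is to compare the raw response sizes. This is only a sketch: the article id is a hypothetical example, and it assumes the desktop version answers at habr.com/post/<id>.

import requests

post_id = 282552  # hypothetical article id, used only for the size comparison
full = requests.get("https://habr.com/post/{}".format(post_id))
mobile = requests.get("https://m.habr.com/post/{}".format(post_id))
print("full: {:.0f} KB, mobile: {:.0f} KB".format(len(full.content) / 1024, len(mobile.content) / 1024))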
Second version. Many threads, a temporary ban from Habr
After scouring the internet on the subject of multithreading in Python, I chose the simplest option, multiprocessing.dummy, and the problems arrived together with multithreading.
SQLite3 does not want to work with more than one thread. That is fixed with check_same_thread=False, but it is not the only problem: when inserting into the database, errors sometimes occur that I never managed to resolve.
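For reference, the workaround looks roughly like this. It is only a sketch: the explicit lock around the writes is my own addition, not something from the original script.

import sqlite3
from threading import Lock

# Allow the same connection to be shared between threads
conn = sqlite3.connect('habr.db', check_same_thread=False)
db_lock = Lock()

def insert_row(row):
    # Serialize the actual writes by hand, since sqlite3 itself does not
    # make concurrent writes on a single connection safe
    with db_lock:
        conn.execute('INSERT INTO habr VALUES (?, ?, ?, ?, ?)', row)
        conn.commit()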
So I decided to stop inserting articles straight into the database and, remembering a solution I had come across before, decided to use files instead, since there is no problem with writing to files from many threads.

Habr starts banning you once you use more than three threads, and particularly zealous attempts end with an IP ban for a couple of hours. With only three threads everything is fine, but because of that limit, downloading 100 articles takes from 26 down to 12 seconds. It is worth noting that this time is unstable and seems to depend on luck with each particular article.
async_v1.py
from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    currentFile = "files\\{}.json".format(i)
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/post/{}".format(i)

    try:
        r = requests.get(url)
    except:
        # Log requests that failed at the network level
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    # Record requests blocked by the server
    if r.status_code == 503:
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
        logging.warning('{} / 503 Error'.format(i))

    # If the post does not exist or has been hidden
    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code

    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html5lib')

    try:
        author = soup.find(class_="tm-user-info__username").get_text()

        timestamp = soup.find(class_='tm-user-meta__date')
        timestamp = timestamp['title']

        content = soup.find(id="post-content-body")
        content = str(content)
        title = soup.find(class_="tm-article-title__text").get_text()
        tags = soup.find(class_="tm-article__tags").get_text()
        tags = tags[5:]

        # Label showing that the post is a translation or a tutorial
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()

        rating = soup.find(class_="tm-votes-score").get_text()
    except:
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "An error occurred while parsing this page."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")

    # Write the article to a JSON file
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(currentFile, "w") as write_file:
            json.dump(article, write_file)
    except:
        print(i)
        raise

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("The min and max parameters are required. Usage: async_v1.py 1 100")
        sys.exit(1)

    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # With more than 3 threads Habr temporarily bans the IP
    pool = ThreadPool(3)

    # Start the timer and launch the threads
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    # After all threads finish, print the elapsed time
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
Third version. Final
While debugging the second version, I discovered that Habr, it turns out, has an API which the mobile version of the site calls. It loads faster than the mobile pages, since it is plain JSON that does not even need to be parsed. In the end I decided to rewrite my script once more. So, having found this API, I started parsing it.
async_v2.py
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    currentFile = "files\\{}.json".format(i)
    if os.path.isfile(currentFile):
        logging.info("{} - File exists".format(i))
        return 1

    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)

    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except:
        # Log requests that failed at the network level
        with open("req_errors.txt", "a") as file:
            file.write(str(i) + "\n")
        return 2

    data = json.loads(r.text)
    if data['success']:
        article = data['data']['article']

        id = article['id']
        is_tutorial = article['is_tutorial']
        time_published = article['time_published']
        comments_count = article['comments_count']
        lang = article['lang']
        tags_string = article['tags_string']
        title = article['title']
        content = article['text_html']
        reading_count = article['reading_count']
        author = article['author']['login']
        score = article['voting']['score']

        data = (id, is_tutorial, time_published, title, content, comments_count, lang, tags_string, reading_count, author, score)

        with open(currentFile, "w") as write_file:
            json.dump(data, write_file)

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("The min and max parameters are required. Usage: async_v2.py 1 100")
        sys.exit(1)

    min = int(sys.argv[1])
    max = int(sys.argv[2])

    # With more than 3 threads Habr temporarily bans the IP
    pool = ThreadPool(3)

    # Start the timer and launch the threads
    start_time = datetime.now()
    results = pool.map(worker, range(min, max))

    # After all threads finish, print the elapsed time
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
This is what the data related to an article and its author looks like:
(API.png)
I did not save the full JSON of every article, but kept only the fields I needed (a short fetch sketch follows the list):
- id
- is_tutorial
- time_published
- title
- content
- comments_count
- lang: the language the article is written in; so far it only takes the values en and ru
- tags_string: the tags of the post
- reading_count
- author
- score: the article's rating
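As a quick illustration, a single API call already returns all of these fields. This is only a sketch, and the article id in the URL is a hypothetical example:

import requests

r = requests.get("https://m.habr.com/kek/v1/articles/282552/?fl=ru%2Cen&hl=ru")
data = r.json()
if data['success']:
    article = data['data']['article']
    for field in ('id', 'is_tutorial', 'time_published', 'title',
                  'comments_count', 'lang', 'tags_string', 'reading_count'):
        print(field, '=', article[field])
    print('author =', article['author']['login'])
    print('score =', article['voting']['score'])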
So, using the API, the processing time dropped to about 8 seconds per 100 URLs.
Once the data has been downloaded, it has to be processed and put into the database. There were no problems with that either:
parser.py
import json
import sqlite3
import logging
from datetime import datetime
def parser(min, max):
    conn = sqlite3.connect('habr.db')
    c = conn.cursor()
    c.execute('PRAGMA encoding = "UTF-8"')
    c.execute('PRAGMA synchronous = 0')  # Turn off write confirmation; speed goes up several times.
    c.execute("CREATE TABLE IF NOT EXISTS articles(id INTEGER, time_published TEXT, author TEXT, title TEXT, content TEXT, "
              "lang TEXT, comments_count INTEGER, reading_count INTEGER, score INTEGER, is_tutorial INTEGER, tags_string TEXT)")
    try:
        for i in range(min, max):
            try:
                filename = "files\\{}.json".format(i)
                f = open(filename)
                data = json.load(f)

                (id, is_tutorial, time_published, title, content, comments_count, lang,
                 tags_string, reading_count, author, score) = data

                # For the sake of the database's readability you can sacrifice the code's readability. Or can you?
                # If you think so, you can simply replace this tuple with the data argument. Up to you.
                c.execute('INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (id, time_published, author,
                                                                                            title, content, lang,
                                                                                            comments_count, reading_count,
                                                                                            score, is_tutorial,
                                                                                            tags_string))
                f.close()
            except IOError:
                logging.info('FileNotExists')
                continue
    finally:
        conn.commit()

start_time = datetime.now()
parser(490000, 490918)
print(datetime.now() - start_time)
Statistics
Well, traditionally, to finish off, some statistics can be pulled out of the data:
- Out of the expected 490,406 articles, only about 228 thousand were downloaded. It turns out that more than half of the articles on Habr have been hidden or deleted.
- The full database, covering almost half a million articles, weighs 2.95 GB. Compressed: 495 MB.
- In total, 37,804 people have written for Habr. Keep in mind that these statistics only cover the articles I managed to download.
- The most prolific author on Habr is alizar: 8,774 articles.
- The highest-rated article gathered 1,448 points.
- The most-read article has 1,660,841 views.
- The most-discussed article has 2,444 comments.
And the tops, in table form (a sketch of the corresponding queries follows):
- Top 15 authors
- Top 15 by rating
- Top 15 most read
- Top 15 most discussed
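For completeness, here is a sketch of how figures and tops like these can be pulled out of the resulting habr.db with plain SQL over the articles table defined in parser.py:

import sqlite3

conn = sqlite3.connect('habr.db')
c = conn.cursor()

# How many articles were collected and how many distinct authors wrote them
print(c.execute("SELECT COUNT(*), COUNT(DISTINCT author) FROM articles").fetchone())

# Top 15 authors by number of articles
print(c.execute("SELECT author, COUNT(*) AS n FROM articles "
                "GROUP BY author ORDER BY n DESC LIMIT 15").fetchall())

# Top 15 by rating, by views and by number of comments
for column in ("score", "reading_count", "comments_count"):
    print(c.execute("SELECT title, {} FROM articles "
                    "ORDER BY {} DESC LIMIT 15".format(column, column)).fetchall())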
Source: www.habr.com