Yog tias koj cuam tshuam qhov kev ua tiav ntawm tsab ntawv, ces tag nrho cov ntaub ntawv yuav tsis mus qhov twg. Tom qab tag nrho, qhov kev cog lus tsuas yog ua tom qab tag nrho cov parsing.
Tau kawg, koj tuaj yeem hloov pauv rau cov ntaub ntawv tom qab txhua qhov ntxig, tab sis tom qab ntawd lub sijhawm ua tiav tsab ntawv yuav nce ntxiv.
Tom ntej no kuv pom cov neeg siv cov lus ua ke, uas kuv nyeem thiab pom ob peb lub neej hacks kom ceev cov txheej txheem no:
Siv multithreading kom txo lub sijhawm rub tawm (downloading).
Koj tuaj yeem tau txais tsis yog tag nrho version ntawm habr, tab sis nws cov mobile version.
Piv txwv li, yog tias ib tsab xov xwm sib koom ua ke hauv desktop version hnyav 378 KB, tom qab ntawd hauv mobile version nws twb yog 126 KB.
Thib ob version. Ntau cov xov, ib ntus txwv los ntawm Habr
Yog li ntawd, kuv txiav txim siab tso tseg qhov kev tso tawm tam sim ntawm cov khoom ncaj qha rau hauv cov ntaub ntawv thiab, nco ntsoov cov kev daws teeb meem sib xyaw, kuv txiav txim siab siv cov ntaub ntawv, vim tias tsis muaj teeb meem nrog ntau txoj xov sau rau ib cov ntaub ntawv.
Habr pib txwv tsis pub siv ntau tshaj peb txoj xov.
Tshwj xeeb tshaj yog kev mob siab rau kom dhau mus rau Habr tuaj yeem xaus nrog kev txwv ip rau ob peb teev. Yog li koj yuav tsum siv 3 xov nkaus xwb, tab sis qhov no yog qhov zoo, txij li lub sijhawm los iterate ntau tshaj 100 kab lus raug txo los ntawm 26 mus rau 12 vib nas this.
Nws yog ib nqi sau cia hais tias qhov no version yog tsis ruaj tsis khov, thiab downloads ib ntus poob tawm ntawm ib tug loj tus naj npawb ntawm cov khoom.
async_v1.py
from bs4 import BeautifulSoup
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    """Download Habr post *i* from the mobile site and store it as files\\<i>.json.

    Returns 1 when the file already exists, 2 on a request error,
    the HTTP status code for non-200 responses, and None on success.
    """
    current_file = "files\\{}.json".format(i)
    if os.path.isfile(current_file):
        logging.info("{} - File exists".format(i))
        return 1
    url = "https://m.habr.com/post/{}".format(i)
    try:
        r = requests.get(url)
    except requests.RequestException:
        # Bug fixes: open in append mode (the original opened read-only and
        # would raise on write), convert the int id to str, write a real
        # newline (the original wrote a literal "n").
        with open("req_errors.txt", "a") as err_file:
            err_file.write(str(i) + "\n")
        return 2
    # Record requests that the server blocked with 503.
    if r.status_code == 503:
        with open("Error503.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
        logging.warning('{} / 503 Error'.format(i))
    # Post does not exist or was hidden.
    if r.status_code != 200:
        logging.info("{} / {} Code".format(i, r.status_code))
        return r.status_code
    soup = BeautifulSoup(r.text, 'html5lib')
    try:
        author = soup.find(class_="tm-user-info__username").get_text()
        # ['title'] raises TypeError when the element is missing (find
        # returned None), hence TypeError in the except below.
        timestamp = soup.find(class_='tm-user-meta__date')['title']
        content = str(soup.find(id="post-content-body"))
        title = soup.find(class_="tm-article-title__text").get_text()
        # Slice off the leading label before the tag list.
        tags = soup.find(class_="tm-article__tags").get_text()[5:]
        # Marks whether the post is a translation or a tutorial.
        tm_tag = soup.find(class_="tm-tags tm-tags_post").get_text()
        rating = soup.find(class_="tm-votes-score").get_text()
    except (AttributeError, TypeError):
        # A selector matched nothing; record the post id and keep going.
        author = title = tags = timestamp = tm_tag = rating = "Error"
        content = "При парсинге этой странице произошла ошибка."
        logging.warning("Error parsing - {}".format(i))
        with open("Errors.txt", "a") as write_file:
            write_file.write(str(i) + "\n")
    # Persist the article as a JSON list.
    try:
        article = [i, timestamp, author, title, content, tm_tag, rating, tags]
        with open(current_file, "w") as write_file:
            json.dump(article, write_file)
    except Exception:
        print(i)
        raise
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print("Необходимы параметры min и max. Использование: async_v1.py 1 100")
        sys.exit(1)
    # Renamed from min/max so the builtins are not shadowed.
    start_id = int(sys.argv[1])
    end_id = int(sys.argv[2])
    # More than 3 threads gets the IP temporarily banned by Habr.
    pool = ThreadPool(3)
    # Time the whole run, then launch the workers.
    start_time = datetime.now()
    results = pool.map(worker, range(start_id, end_id))
    # Print the elapsed time once every thread has finished.
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
Thib peb version. Kawg
Thaum debugging qhov thib ob version, kuv pom tias Habr, tam sim ntawd, muaj API uas lub xov tooj ntawm lub vev xaib nkag mus. Nws thauj khoom sai dua li lub xov tooj ntawm tes, vim nws tsuas yog json xwb, uas tsis tas yuav tsum tau parsed. Thaum kawg, kuv txiav txim siab rov sau kuv tsab ntawv dua.
Yog li, tau pom qhov txuas no API, koj tuaj yeem pib parsing nws.
async_v2.py
import requests
import os, sys
import json
from multiprocessing.dummy import Pool as ThreadPool
from datetime import datetime
import logging
def worker(i):
    """Download article *i* through the Habr mobile API and save it as JSON.

    Returns 1 when the file already exists, 503 when the server blocks the
    request, 2 on a request error, 3 on a non-JSON response, None otherwise.
    """
    current_file = "files\\{}.json".format(i)
    if os.path.isfile(current_file):
        logging.info("{} - File exists".format(i))
        return 1
    url = "https://m.habr.com/kek/v1/articles/{}/?fl=ru%2Cen&hl=ru".format(i)
    try:
        r = requests.get(url)
        if r.status_code == 503:
            logging.critical("503 Error")
            return 503
    except requests.RequestException:
        # Bug fixes: append mode (the original opened read-only and would
        # raise on write) and str(i) (write() rejects ints).
        with open("req_errors.txt", "a") as err_file:
            err_file.write(str(i) + "\n")
        return 2
    try:
        data = json.loads(r.text)
    except ValueError:
        # Non-JSON body (e.g. an HTML error page); the original crashed here.
        logging.warning("{} - Invalid JSON response".format(i))
        return 3
    if data['success']:
        article = data['data']['article']
        # Field order matches the unpacking done later by parser.py.
        record = (
            article['id'],                # original shadowed builtin `id`
            article['is_tutorial'],
            article['time_published'],
            article['title'],
            article['text_html'],
            article['comments_count'],
            article['lang'],
            article['tags_string'],
            article['reading_count'],
            article['author']['login'],
            article['voting']['score'],
        )
        with open(current_file, "w") as write_file:
            json.dump(record, write_file)
if __name__ == '__main__':
    if len(sys.argv) < 3:
        # Fixed the usage message: this script is async_v2.py, not "asyc.py".
        print("Необходимы параметры min и max. Использование: async_v2.py 1 100")
        sys.exit(1)
    # Renamed from min/max so the builtins are not shadowed.
    start_id = int(sys.argv[1])
    end_id = int(sys.argv[2])
    # More than 3 threads gets the IP temporarily banned by Habr.
    pool = ThreadPool(3)
    # Time the whole run, then launch the workers.
    start_time = datetime.now()
    results = pool.map(worker, range(start_id, end_id))
    # Print the elapsed time once every thread has finished.
    pool.close()
    pool.join()
    print(datetime.now() - start_time)
Nws muaj cov teb ntsig txog ob qho tib si rau tsab xov xwm nws tus kheej thiab rau tus kws sau ntawv uas tau sau nws.
lang yog hom lus uas sau ntawv. Txog tam sim no, nws tsuas muaj en thiab ru.
tags_string - tag nrho cov cim los ntawm tus ncej
reading_count - pes tsawg zaus tau nyeem tsab xov xwm.
author - tus sau tsab xov xwm.
score - qhab nees (rating) ntawm tsab xov xwm.
Yog li, siv API, Kuv txo cov ntawv sau sijhawm rau 8 vib nas this ib 100 url.
Tom qab peb tau rub tawm cov ntaub ntawv peb xav tau, peb yuav tsum tau ua nws thiab nkag mus rau hauv cov ntaub ntawv. Kuv tsis muaj teeb meem nrog qhov no thiab:
parser.py
import json
import sqlite3
import logging
from datetime import datetime
def parser(min, max):
    """Load files\\<i>.json for i in [min, max) into the `articles` table of habr.db.

    Missing files are logged and skipped. Parameter names shadow the
    builtins min/max; kept for backward compatibility with callers.
    """
    conn = sqlite3.connect('habr.db')
    try:
        c = conn.cursor()
        c.execute('PRAGMA encoding = "UTF-8"')
        # Disable write confirmation; speeds up inserts considerably.
        c.execute('PRAGMA synchronous = 0')
        # Original split this literal across two source lines without a
        # continuation — a SyntaxError. Triple-quoted string fixes it.
        c.execute("""CREATE TABLE IF NOT EXISTS articles(
            id INTEGER, time_published TEXT, author TEXT, title TEXT,
            content TEXT, lang TEXT, comments_count INTEGER,
            reading_count INTEGER, score INTEGER, is_tutorial INTEGER,
            tags_string TEXT)""")
        for i in range(min, max):
            filename = "files\\{}.json".format(i)
            try:
                with open(filename) as f:
                    data = json.load(f)
            except IOError:
                # Post was never downloaded (or was blocked); skip it.
                logging.info('FileNotExists')
                continue
            # Field order matches the tuple written by async_v2.py.
            (id_, is_tutorial, time_published, title, content, comments_count,
             lang, tags_string, reading_count, author, score) = data
            c.execute(
                'INSERT INTO articles VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)',
                (id_, time_published, author, title, content, lang,
                 comments_count, reading_count, score, is_tutorial,
                 tags_string))
    finally:
        # One commit for the whole batch — pairs with synchronous = 0 above.
        conn.commit()
        conn.close()
if __name__ == '__main__':
    # Guard added so importing this module does not trigger the import run,
    # matching the style of the downloader scripts.
    start_time = datetime.now()
    # Range matches the ids of the downloaded posts.
    parser(490000, 490918)
    print(datetime.now() - start_time)
Zoo, nyob rau hauv daim ntawv ntawm tops: Sab saum toj 15 tus kws sau ntawv, Sab saum toj 15 los ntawm kev ntaus nqi, Sab saum toj 15 nyeem, Sab saum toj 15 sib tham.