新年のオリヴィエ (サラダ) が終わったとき、私は何もすることがなかったので、ハブラハブ (および関連プラットフォーム) からすべての記事をコンピューターにダウンロードして探索することにしました。
興味深い話がいくつかありました。 1 つ目は、サイトの 12 年間にわたる記事の形式とトピックの発展です。 たとえば、いくつかのトピックのダイナミクスは非常に示唆的です。 続きはカットの下で。
解析プロセス
ハブルがどのように発展したかを理解するには、そのすべての記事に目を通し、そこからメタ情報 (日付など) を抽出する必要がありました。 すべての記事へのリンクは「habrahabr.ru/post/337722/」のような形式で、番号は厳密に順序付けられているため、巡回は簡単でした。 最後の投稿の番号が 35 万 (350000) よりわずかに小さいことがわかったので、考えられるすべてのドキュメント ID をループで調べました (Python コード)。
import numpy as np
from multiprocessing import Pool
# Crawl every candidate post ID in parallel. The pool is deliberately much
# larger than the CPU core count because the work is limited by the network,
# not by the processor.
# The __main__ guard keeps worker processes that are started with the "spawn"
# method from re-running this block when they re-import the module.
if __name__ == '__main__':
    with Pool(100) as p:
        docs = p.map(download_document, np.arange(350000))
関数 download_document は、対応する ID を持つページをロードし、HTML 構造から意味のある情報を抽出しようとします。
import os
import pickle

import requests
from bs4 import BeautifulSoup
def download_document(pid):
    """Download one Habr post, extract its main fields and pickle the result.

    Parameters
    ----------
    pid : int
        Numeric post ID. It is inserted into the post URL and also used as
        the name of the output file.

    Returns
    -------
    dict
        The extracted record. It always contains ``'id'`` and ``'status'``;
        ``'title'``, ``'text'`` and ``'time'`` are added only when ``status``
        is ``'ok'``. The same dict is written to ``files/<pid>.pkl``.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (including when the timeout expires).
    """
    # Download the document. The timeout keeps one stalled connection from
    # blocking a pool worker forever.
    r = requests.get('https://habrahabr.ru/post/' + str(pid) + '/', timeout=30)
    # Parse the document.
    soup = BeautifulSoup(r.text, 'html5lib')  # instead of html.parser
    doc = {'id': pid}
    # Look the title up once and reuse it for both the check and the value.
    title = soup.find("span", {"class": "post__title-text"})
    if title is None:
        # This happens when the article never existed or has been deleted.
        doc['status'] = 'title_not_found'
    else:
        doc['status'] = 'ok'
        doc['title'] = title.text
        doc['text'] = soup.find("div", {"class": "post__text"}).text
        doc['time'] = soup.find("span", {"class": "post__time"}).text
        # create other fields: hubs, tags, views, comments, votes, etc.
        # ...
    # Save the result to a separate file, creating the output directory on
    # first use.
    os.makedirs('files', exist_ok=True)
    fname = r'files/' + str(pid) + '.pkl'
    with open(fname, 'wb') as f:
        pickle.dump(doc, f)
    # Hand the record back so that Pool.map() callers actually collect it.
    return doc
解析の過程で、いくつかの新しいことを発見しました。
まず、プロセッサ内のコア数よりも多くのプロセスを作成しても無駄だと言われています。 しかし、私の場合、制約となっているリソースはプロセッサではなくネットワークであり、100 個のプロセスは 4 個や、たとえば 20 個のプロセスよりも高速に動作することがわかりました。
次に、一部の投稿には特殊文字の組み合わせが含まれていました。たとえば、「%&#@」のような婉曲表現です。 最初に使用した html.parser は、組み合わせ &# を HTML エンティティの始まりとみなしてしまい、うまく処理できないことがわかりました。 私はすでに黒魔術を行うつもりでしたが、フォーラムではパーサーを変更するだけでよいと提案されました。
第äžã«ã65927 åãé€ããŠãã¹ãŠã®åºçç©ãã¢ã³ããŒãããããšãã§ããŸããã çªå· 162075ã275987ãããã³ 338586 ã®ããã¥ã¡ã³ãã¯ããŠã€ã«ã¹å¯Ÿçããã°ã©ã ã«ãã£ãŠå³åº§ã«åé€ãããŸããã ãããã¯ãããããæªæã®ãã PDF ãããŠã³ããŒããã JavaScript ãã§ãŒã³ããã©ãŠã¶ ãã©ã°ã€ã³ã®ã»ããã®åœ¢åŒã® SMS ã©ã³ãµã ãŠã§ã¢ãiPhone ãåèµ·åããã CrashSafari.com ãµã€ãã«é¢ããèšäºã§ãã ãã®åŸãã¢ã³ããŠã€ã«ã¹ã¯ã·ã¹ãã ã¹ãã£ã³äžã«å¥ã®èšäºãçºèŠããŸãããæçš¿ XNUMX ã¯ããŠãŒã¶ãŒã®ããã»ããµã䜿çšããŠæå·é貚ãæ¡æããããã ã¹ãã¢ã® Web ãµã€ãäžã®ã¹ã¯ãªããã«é¢ãããã®ã§ãã ãããã£ãŠããŠã€ã«ã¹å¯Ÿçã®æ©èœã¯ååã§ãããšèããããŸãã
ãã©ã€ããèšäºã¯ãæœåšçãªæ倧æ°ã§ãã 166307 件ã®ååã«ãããªãããšãå€æããŸããã æ®ãã«ã€ããŠãHabr æ°ã¯ãããŒãžãå€ããåé€ãããããŸãã¯ãŸã£ããååšããªãã£ãããšããéžæè¢ãäžããŠããŸãã ãŸããäœã§ãèµ·ãããŸãã
記事のダウンロードに続いて技術的な作業が行われました。たとえば、公開日を「2006 年 12 月 21 日 10:47」という形式から標準の datetime に、閲覧数「12,8k」を 12800 に変換する必要がありました。 この段階で、さらにいくつかのインシデントが明らかになりました。 最も面白いのは、投票数とデータ型に関係するものです。古い投稿には int オーバーフローがあり、それぞれ 65535 票を受け取りました。
ãã®çµæãèšäºã®ããã¹ã (ç»åãªã) 㧠1.5 ã¡ã¬ãã€ããã¡ã¿æ å ±ãå«ãã³ã¡ã³ãã§ããã« 3 ã¡ã¬ãã€ããèšäºã«é¢ããã¡ã¿æ å ±ã§çŽ XNUMX ã¡ã¬ãã€ããå¿ èŠã«ãªããŸããã ããã¯å®å šã« RAM ã«ä¿åã§ããã®ã§ãç§ã«ãšã£ãŠã¯å¬ããé©ãã§ããã
私は記事の分析を、テキストそのものからではなく、日付、タグ、ハブ、閲覧数、いいねなどのメタ情報から始めました。 メタ情報だけでも多くのことがわかることが判明しました。
ハブラハブルの開発動向
このサイトの記事は 2006 年から公開されています。 最も集中的に公開されたのは 2008 年から 2016 年です。
ãããã®èšäºãããŸããŸãªææã«ã©ãã ã掻çºã«èªãŸããããè©äŸ¡ããã®ã¯ããã»ã©ç°¡åã§ã¯ãããŸããã 2012 幎以åã®ããã¹ãã«ã¯ããå€ãã®ã³ã¡ã³ããšè©äŸ¡ãå«ãŸããŠããŸãããæ°ããããã¹ãã»ã©é²èŠ§æ°ãšããã¯ããŒã¯æ°ãå€ããªããŸãã ãããã®ææšãåãããã«åäœ (åæž) ããã®ã¯ã2015 幎㮠XNUMX åã ãã§ãã ãããããçµæžçãæ¿æ²»çå±æ©ã®ç¶æ³ã§ãèªè ã®é¢å¿ã¯ IT ããã°ãããããçã¿ã䌎ãåé¡ãžãšç§»ã£ãŠããã®ã§ãããã
èšäºèªäœã«å ããŠãããã«å€ãã®ã³ã¡ã³ããããŠã³ããŒãããŸããã ã³ã¡ã³ãã¯6äžä»¶ãã£ããããã®ãã¡240äžä»¶ãçŠæ¢ãããïŒãUFOãé£æ¥ããŠããã®ç¢æãããã«å ¬éãããïŒã ã³ã¡ã³ãã®äŸ¿å©ãªç¹æ§ã¯ãã³ã¡ã³ãã«ã¿ã€ã ã¹ã¿ã³ããããããšã§ãã ã³ã¡ã³ãã®æéã調æ»ããããšã§ãèšäºããã€èªãŸãããã倧ãŸãã«ææ¡ã§ããŸãã
ã»ãšãã©ã®èšäºã¯ãåå 10 æããååŸ 20 æãŸã§ã®éãã€ãŸããå·çãšã³ã¡ã³ãã®äž¡æ¹ãè¡ãããŠããããšãå€æããŸããã ã¢ã¹ã¯ã¯ã®å žåçãªå€åæ¥ã ããã¯ãããã«ãå°éçãªç®çã§èªãŸããŠããããšããããŠãããä»äºãå 延ã°ãã«ããè¯ãæ¹æ³ã§ããããšãæå³ããŠããå¯èœæ§ããããŸãã ã¡ãªã¿ã«ããã®æå»ååžã¯ããã«åµèšåœåããçŸåšã«è³ããŸã§å®å®ããŠããŸãã
ãã ããã³ã¡ã³ãã®ã¿ã€ã ã¹ã¿ã³ãã®äž»ãªå©ç¹ã¯ãæå»ã§ã¯ãªããèšäºã®ãæå¹æéãã®æéã§ãã èšäºã®å ¬éããã³ã¡ã³ããŸã§ã®æéé åãèšç®ããŠã¿ãŸããã çŸåšãã³ã¡ã³ãã®äžå€®å€ (ç·ã®ç·) ã¯çŽ 20 æé以å ã«å°çããããšãããããŸããã å ¬éåŸã®åæ¥ã«ã¯ãå¹³åããŠãèšäºã«å¯Ÿããå šã³ã¡ã³ãã®åå匷ãæ®ãããŸãã ãã㊠75 æ¥éã§ãå šã³ã¡ã³ãã® 2010% ãæ®ãããŸãã åæã«ã以åã®èšäºã¯ããã«éãèªãŸããŸãããããšãã°ã6 幎ã«ã¯ãã³ã¡ã³ãã®ååãæåã® XNUMX æé以å ã«å±ããŸããã
ã³ã¡ã³ããé·ããªã£ãã®ã«ã¯é©ããŸãããã³ã¡ã³ãã®å¹³åæåæ°ã¯ãããã«ã®ç涯ã§ã»ãŒ XNUMX åã«ãªããŸããã
ã³ã¡ã³ããããç°¡åãªãã£ãŒãããã¯ã¯æ祚ã§ãã ä»ã®å€ãã®ãªãœãŒã¹ãšã¯ç°ãªããHabré ã§ã¯ãã©ã¹ã ãã§ãªããã€ãã¹ãå ¥åã§ããŸãã ããããèªè ã¯æåŸã®æ©äŒãããŸãå©çšããŸãããçŸåšã®å«ãã®å²åã¯å šæ祚æ°ã®çŽ 15% ã§ãã 以åã¯ãã£ãšãããŸããããæéãçµã€ã«ã€ããŠãèªè ã¯ãã芪åã«ãªããŸããã
ããã¹ãèªäœã¯æ代ãšãšãã«å€åããŠããŸããã ããšãã°ãå žåçãªããã¹ãã®é·ãã¯ãå±æ©ã«ããããããããµã€ãã®ç«ã¡äžãåœåããçå®ã«å¢å ãæ¢ããŸããã XNUMX 幎éã§ãããã¹ãã¯ã»ãŒ XNUMX åã®é·ãã«ãªããŸããã
ããã¹ãã®ã¹ã¿ã€ã«ã (äžæ¬¡è¿äŒŒçã«ã¯) å€æŽãããŸããã ããšãã°ãããã«ãååšããŠããæåã®æ°å¹Žéã¯ãããã¹ãå ã®ã³ãŒããšæ°åã®å²åãå¢å ããŸããã
ãµã€ãå šäœã®ãã€ããã¯ã¹ãç解ããåŸãããŸããŸãªãããã¯ã®äººæ°ãã©ã®ããã«å€åãããã枬å®ããããšã«ããŸããã ãããã¯ã¯ããã¹ãããèªåçã«éžæã§ããŸããããŸãæåã«ãè»èŒªã®åçºæã¯ã§ãããåèšäºã®èè ã«ãã£ãŠä»ããããæ¢æã®ã¿ã°ã䜿çšããŸãã 代衚çãªXNUMXã€ã®ãã¬ã³ãããã£ãŒãã«æããŠã¿ãŸããã ãGoogleãããŒãã¯åœåïŒãããã䞻㫠SEO ã«ããïŒåªå¢ã§ããããé·å¹Žã«ããã£ãŠãã®æ¯éã倱ãã€ã€ãããŸãã Javascript ã¯äººæ°ã®ãããã¯ã§ããããã£ãããšæé·ãç¶ããŠããŸãããæ©æ¢°åŠç¿ãæ¥éã«æ®åãå§ããã®ã¯ããæ°å¹Žã®ããšã§ãã äžæ¹ãLinux 㯠XNUMX 幎éãéããŠåæ§ã«é¢é£æ§ãä¿ã¡ç¶ããŠããŸãã
ãã¡ãããã©ã®ãããªãããã¯ãããå€ãã®èªè ã®æŽ»åãæ¹ãã€ããã®ãã«èå³ãæã¡ãŸããã åãããã¯ã®é²èŠ§æ°ãæ祚æ°ãã³ã¡ã³ãæ°ã®äžå€®å€ãèšç®ããŸããã äœãèµ·ãã£ããã¯æ¬¡ã®ãšããã§ãã
- æãé²èŠ§ãããŠãããããã¯: ArduinoãWeb ãã¶ã€ã³ãWeb éçºããã€ãžã§ã¹ãããªã³ã¯ãCSSãHTMLãHTML5ãnginxãã¢ã«ãŽãªãºã ã
- æããããããããããããã¯: vkontakteããŠãŒã¢ã¢ãjqueryãoperaãcãhtmlãweb éçºãhtml5ãcssãweb ãã¶ã€ã³ã
- æãè°è«ãããŠãããããã¯: ãªãã©ãã¹ã«ã€ããããªãŒã©ã³ã¹ãvkontakteãubuntuãä»äºãnokiaãnginxãarduinoãfirefoxã
ã¡ãªã¿ã«ããããã¯ãæ¯èŒããŠããã®ã§ãé »åºŠã§ã©ã³ã¯ä»ãããããšãã§ããŸã (çµæã次ã®ããã«æ¯èŒããããšãã§ããŸã)
- Habr ãååšããŠä»¥æ¥ãæã人æ°ã®ããã¿ã° (éé ) ã¯ãgoogleãandroidãjavascriptãmicrosoftãlinuxãphpãappleãjavaãpythonãããã°ã©ãã³ã°ãã¹ã¿ãŒãã¢ãããéçºãiosãã¹ã¿ãŒãã¢ããããœãŒã·ã£ã« ãããã¯ãŒã¯ã§ãã
- 2017 幎ã«æã人æ°ããã£ãã®ã¯ãJavaScriptãPythonãJavaãAndroidãéçºãLinuxãC++ãããã°ã©ãã³ã°ãPHPãC#ãiOSãæ©æ¢°åŠç¿ãæ å ±ã»ãã¥ãªãã£ãMicrosoftãReact ã§ããã
ãããã®è©äŸ¡ãæ¯èŒãããšããããšãã°ãPython ã®åå©ãš PHP ã®æ¶æ» ããŸãã¯ã¹ã¿ãŒãã¢ããé¢é£ã®ãããã¯ã®ãæ²æ²¡ããšæ©æ¢°åŠç¿ã®éçã«æ³šç®ããããšãã§ããŸãã
Habré ã®ãã¹ãŠã®ã¿ã°ã«ãããã»ã©æçœãªããŒãã«ã©ãŒãèšå®ãããŠããããã§ã¯ãããŸããã ããšãã°ãããã«ã¯äžåºŠã ãåºäŒã£ãããç§ã«ã¯é¢çœããšæãããåæ°åã®ã¿ã°ããããŸãã ã€ãŸãããã¢ã€ãã¢ã¯é²æ©ã®åååã§ããããããããã㌠ãã£ã¹ã¯ ã€ã¡ãŒãžããèµ·åãããããã¢ã€ãªã¯å·ããããã©ããããã¹ãŒããŒã¢ãªãŒã·ã¥ãããèžæ°æ©é¢ãããåææ¥ã«ããã¹ãããšãããç§ã¯æã£ãŠãããèæœãåšã«å ¥ã£ããããããããã€ãéãã®çµæã«ãªã£ããããé¢çœãã¿ã°ãæãã€ããªãã£ããã ãã®ãããªèšäºã®äž»é¡ã決å®ããã«ã¯ãã¿ã°ã ãã§ã¯ååã§ã¯ãããŸãããèšäºã®ããã¹ãã«å¯ŸããŠããŒãã®ã¢ããªã³ã°ãå®è¡ããå¿ èŠããããŸãã
記事内容の詳しい分析は次回の投稿で。 まず、記事の内容に基づいて記事のページビュー数を予測するモデルを構築します。 次に、Habr の作者と同じスタイルでテキストを生成するようにニューラル ネットワークを教えたいと考えています。 だから購読してください 🙂
PS そしてこちらがビープ音です
出所： habr.com