áá±áž áá¬á!
ááá±á·áá»áœááºá¯ááºááá¯á·ááẠPython ááœááºáá±áá¬áá»á¬ážááá¯á¡á¯ááºá
á¯ááœá²á·ááŒááºážááŸáá·áºááŒááºáá±á¬ááºáááºááŒááºááŒááºážá¡ááœáẠtools áá»á¬ážááá¯á¡áá¯á¶ážááŒá¯ááŒááºážáá»áœááºážáá»ááºááŸá¯á¡áá±á«áºáá¯ááºáá±á¬ááºáá«áááºá áá±ážáá¬ážááá·áºá¡ááœááº
á¡á ááºá¡áá¬á¡áá á¡á ááœááºá áááºážááá¯ááºáá»á¬ážááᯠáááºááŸááºááŒáá«á áá¯á·á
- áá»á¬ážáááááŸá áºá¡ááá¯áẠáá±áá¬áá»á¬ážááᯠá¡á¯ááºá á¯ááœá²á·ááŒá®áž ááááºááŸá áºáá»áá¯ážáá¯á¶ážá ááœá±ážááœá¬ážááŸá¯ááºážá á¡áá¯á¶ážá á¯á¶áá±á¬ ááá¯ááºážááá áºáá»á¬ážááᯠááŒááºáá±á¬ááºááŒáá·áºáá«á
- á¡áá»áááºááá¯ááºážááá±áááºážá¡á á¬ážáá¯á¶ážá¡áááºáá»á¬ážááá¯ááŸá¬áá«á
- áá±áá¬ááŸá á¡áá»áááºáá¬ááá áºáá¯áá¯á¶ážááᯠá¡ááá¯ááºáž áá ááá¯ááºážááœá²ááŒá®áž áá áºáá¯á á®á¡ááœáẠáá»á¬ážááá áºáŠážá á®á á¡áá»á±á¬áºááŒá¬ážáá¯á¶ážá¡áááºááᯠááŸá¬áá«á ááœá±á·ááŸááá±á¬á¡áááºáá áºáá¯á á®á¡ááœááºá á¡áá»áááºááá¯ááºážááœáẠáááºážáááá¯ááºážááá áºáá»á¬ážááᯠááŒááºáá±á¬ááºááŒáá·áºáá«á
- áá áºááŸá áºáá»áŸáẠáá°áá»á¬ážá 50% ááᯠááœáŸááºážááŒá¯á¶áá¬ážáá±á¬ á¡áááºáááºáá»áŸááŸááááºááᯠááœááºáá»ááºááŒá®áž ááŒááºáá±á¬ááºááŒáá·áºáá« (ááŸá áºááá¯ááºážá¡ááœáẠáá¬áááºá¡áá»áá¯ážáá»áá¯ážááᯠáá»áœááºá¯ááºááá¯á· ááŒááºááœá±á·ááááº)á
- ááŒá¬ážáá¬ááá áºáá¯áá¯á¶ážá០4 ááŸá áºááᯠááœá±ážááŒá®áž á¡áááºááŸá áááá á¬áá¯á¶ážááŒáá·áº ááŒáá·áºáá±ááŒááºáž ááŸá áºá ááºááŸá áºááá¯ááºážá¡ááœáẠááŒááááºááŸáá·áº á¡áááºááŸá áá±á¬ááºáá¯á¶ážá á¬áá¯á¶ážááŒáá·áºá
- áá»á±á¬áºááŒá¬ážáá°áá»á¬áž (ááá¹áááá»á¬ážá á¡ááá¯áá±á¬áºáá»á¬ážá ááá¯ááºáá±á¬ááºáá»á¬ážá áá¯ááºááŸááºáá¬ááºáá±á¬ááºáá»á¬áž) ááᯠá á¬áááºážááŒá¯á á¯ááŒá®áž á¡áááºáá»á¬ážá ááœá±á·ááŒá±á¬ááºážááŸá¯á¡áá±á«áº áááºážááá¯á·áááœáŸááºážááá¯ážááŸá¯ááᯠá¡áá²ááŒááºáá«á á áááºáá°ážáá²á·áááºáá±á¬ááºáá«á
á áá¬ážáá¯á¶ážáááºážáááºážá áá¯ááºáá»á¬ážáá»á¬ážá
ááœá¬ážááŒáá¯ááºá¡á¶á·á
áá±áá¬ááᯠáá»á¬ážáá ááŸáá·áº ááŸá áºá¡ááá¯áẠá¡á¯ááºá á¯ááœá²á·ááŒá®áž ááááºááŸá áºáá»áá¯ážáá¯á¶ážá ááœá±ážááœá¬ážááŸá¯ááºážá á¡áá¯á¶ážá á¯á¶áá±á¬ ááá¯ááºážááá áºáá»á¬ážááᯠááŒááºáá±á¬ááºááŒáá·áºááŒáá«á áá¯á·á
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Total births per sex for every third year from 1880 to 2010, drawn as
# side-by-side bars (one pair of bars per sampled year).
years = np.arange(1880, 2011, 3)
datalist = 'https://raw.githubusercontent.com/wesm/pydata-book/2nd-edition/datasets/babynames/yob{year}.txt'

# One frame per sampled year, each tagged with the year it was read for.
dataframes = []
for year in years:
    dataset = datalist.format(year=year)
    dataframe = pd.read_csv(dataset, names=['name', 'sex', 'count'])
    dataframes.append(dataframe.assign(year=year))
result = pd.concat(dataframes)

# Reduce only the numeric 'count' column: summing whole frames would also
# "add up" (concatenate) the string columns 'name' and 'sex'.
births = result.groupby(['sex', 'year'])['count'].sum()
births_men_list = births.loc['M'].tolist()
births_women_list = births.loc['F'].tolist()

bar_width = 0.4
fig, ax = plt.subplots()
fig.set_size_inches(25, 15)
index = np.arange(len(years))
# The plot strings below were mis-encoded in the source; they are restored to
# the Russian text they stood for.
stolb1 = ax.bar(index, births_men_list, bar_width, color='c', label='Мужчины')
stolb2 = ax.bar(index + bar_width, births_women_list, bar_width, alpha=0.8, color='r', label='Женщины')
ax.set_title('Рождаемость по полу и годам')
ax.set_xlabel('Года')
ax.set_ylabel('Рождаемость')
# Tick positions have to be fixed before they are labelled, otherwise the
# labels are attached to the automatically chosen ticks. Bars are centred on
# their x value, so the middle of each pair is half a bar width to the right.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(years)
ax.legend(loc=9)
fig.tight_layout()
plt.show()
áááá¯ááºážááœáẠáá±áááºážá¡á á¬ážáá¯á¶áž áá¬áááºáá»á¬ážááᯠááŸá¬ááœá±ááŒáá«á áá¯á·á
# Ten names with the highest total number of births over 1880-2010
# (both sexes combined).
years = np.arange(1880, 2011)
dataframes = []
for year in years:
    dataset = datalist.format(year=year)
    dataframe = pd.read_csv(dataset, names=['name', 'sex', 'count'])
    dataframes.append(dataframe)
result = pd.concat(dataframes)

# Sum only 'count'; a frame-wide sum would also concatenate the 'sex' strings
# of every row sharing a name.
names = (
    result.groupby('name', as_index=False)['count']
    .sum()
    .sort_values('count', ascending=False)
)
# A bare expression only displays something inside a notebook cell; print it
# so the result is visible when run as a script too.
print(names.head(10))
áá±áá¬ááŸá á¡áá»áááºáá¬ááá áºáá¯áá¯á¶ážááᯠá¡ááá¯ááºáž áá ááá¯ááºážááœá²ááŒá®áž áá áºáá¯á á®á¡ááœáẠáá»á¬ážááá áºáŠážá á®á áá±áááºážá¡á á¬ážáá¯á¶ážá¡áááºááᯠááœá±á·ááá«áááºá ááœá±á·ááŸááá±á¬ á¡áááºáá áºáá¯á á®á¡ááœááºá áá»áœááºá¯ááºááá¯á·ááẠáááºážá ááá¯ááºážááá áºáá»á¬ážááᯠá¡áá»áááºááá¯ááºáž ááŒááºáá±á¬ááºááŒáá·áºáá«-
# Split the 1880-2010 span into ten consecutive parts of equal width (the last
# one is shorter) and build a human-readable "first-last" label for each part.
years = np.arange(1880, 2011)
part_size = int((years[-1] - years[0]) / 10) + 1
parts = {}


def GetPart(year):
    """Return the zero-based index of the part that *year* falls into.

    Relies on the module-level ``years`` (for the first year of the span) and
    ``part_size`` (the width of one part, in years).
    """
    return int((year - years[0]) / part_size)


for index in range(GetPart(years[-1]) + 1):
    first_year = years[0] + part_size * index
    # The part ends one year before the next part starts; clamp the final
    # part to the last year actually present in the data.
    last_year = min(years[-1], first_year + part_size - 1)
    parts[index] = str(first_year) + '-' + str(last_year)
# For every part of the time span and each sex, find the name with the most
# births, then plot the year-by-year births of every name found that way.
dataframe_parts = []
dataframes = []
for year in years:
    dataset = datalist.format(year=year)
    dataframe = pd.read_csv(dataset, names=['name', 'sex', 'count'])
    # The same yearly frame is kept twice: once tagged with its part label
    # (for picking the winners) and once with the plain year (for plotting).
    dataframe_parts.append(dataframe.assign(years=parts[GetPart(year)]))
    dataframes.append(dataframe.assign(year=year))
result_parts = pd.concat(dataframe_parts)
result = pd.concat(dataframes)

result_parts_sums = result_parts.groupby(['years', 'sex', 'name'], as_index=False)['count'].sum()
# idxmax yields index *labels*, so select with .loc rather than .iloc.
top_rows = result_parts_sums.groupby(['years', 'sex'])['count'].idxmax()
result_parts_names = result_parts_sums.loc[top_rows]
result_sums = result.groupby(['year', 'sex', 'name'], as_index=False)['count'].sum()

# Group the yearly totals once instead of re-grouping on every iteration.
yearly_by_name = result_sums.groupby(['name', 'sex'])
winners = result_parts_names[['name', 'sex']].drop_duplicates().sort_values(['name', 'sex'])
for name, sex in winners.itertuples(index=False):
    group = yearly_by_name.get_group((name, sex))
    fig, ax = plt.subplots(1, 1, figsize=(18, 10))
    # Axis captions were mis-encoded in the source; restored to Russian.
    ax.set_xlabel('Года')
    ax.set_ylabel('Рождаемость')
    ax.plot(group['year'], group['count'], label=name, color='b', ls='-')
    ax.legend(loc=9, fontsize=11)
# Shown once, after every figure has been created.
plt.show()
ááŸá áºá áẠáá°áá»á¬ážá 50% ááᯠááœáŸááºážááŒá¯á¶áá¬ážáá±á¬ á¡áááºáááºáá»áŸááŸááááºááᯠááœááºáá»ááºááŒá®áž á€áá±áá¬ááᯠááŒááºáá±á¬ááºááŒáá·áºáá«-
# For every year, count how many of the most frequent names are needed to
# account for 50% of that year's births, and plot that count over time.
years = np.arange(1880, 2011)
rows = []
for year in years:
    dataset = datalist.format(year=year)
    yearly = pd.read_csv(dataset, names=['name', 'sex', 'count'])
    # Births per name (both sexes merged), most frequent first.
    counts = yearly.groupby('name')['count'].sum().sort_values(ascending=False)
    cum_perc = counts.cumsum() / counts.sum() * 100
    # searchsorted gives the position of the first name whose running share
    # reaches 50%; that name is itself needed, hence the +1. Counting the rows
    # with cum_perc <= 50 would stop one name short of covering half.
    covering = int(cum_perc.searchsorted(50)) + 1
    rows.append({'year': year, 'count': covering})
# Build the frame once; DataFrame.append no longer exists in pandas 2.x and
# re-copied the whole frame on every call.
dataframe = pd.DataFrame(rows, columns=['year', 'count'])

fig, ax1 = plt.subplots(1, 1, figsize=(22, 13))
# Captions were mis-encoded in the source; restored to Russian.
ax1.set_xlabel('Года', fontsize=12)
ax1.set_ylabel('Разнообразие имен', fontsize=12)
# The line carries a label so that the legend below has an entry to show.
ax1.plot(dataframe['year'], dataframe['count'], color='r', ls='-', label='Разнообразие имен')
ax1.legend(loc=9, fontsize=12)
plt.show()
ááŒá¬ážáá¬ááá áºáá¯áá¯á¶ážá០4 ááŸá áºááᯠááœá±ážááŒá®áž á¡áááºááŸá áááá á¬áá¯á¶ážááŒáá·áº ááŒáá·áºáá»á®ááŒá®áž á¡áááºá áá±á¬ááºáá¯á¶ážá á¬áá¯á¶ážááŒáá·áº ááŸá áºá ááºááŒáááŒáá«á áá¯á·á
from string import ascii_lowercase, ascii_uppercase
# For four selected years, plot the share (in %) of distinct names that start
# with each letter and, on a second figure, that end with each letter.
fig_first, ax_first = plt.subplots(1, 1, figsize=(14, 10))
fig_last, ax_last = plt.subplots(1, 1, figsize=(14, 10))
index = np.arange(len(ascii_uppercase))
years = [1944, 1978, 1991, 2003]
colors = ['r', 'g', 'b', 'y']
# One bar per year inside each letter slot; the bars of a slot together span
# 0.8 so neighbouring letters do not overlap. (`bar_width` was referenced but
# never defined in the original.)
bar_width = 0.8 / len(years)
for n, (year, color) in enumerate(zip(years, colors)):
    dataset = datalist.format(year=year)
    yearly = pd.read_csv(dataset, names=['name', 'sex', 'count'])
    # A name given to both sexes is counted once.
    unique_names = yearly['name'].drop_duplicates()
    count = len(unique_names)
    # Count by first / last character in one pass; reindex keeps the letters
    # in alphabetical order and fills letters that never occur with 0.
    frequency_first = (
        unique_names.str[0].value_counts().reindex(list(ascii_uppercase), fill_value=0)
        / count * 100
    )
    frequency_last = (
        unique_names.str[-1].value_counts().reindex(list(ascii_lowercase), fill_value=0)
        / count * 100
    )
    ax_first.bar(index + bar_width * n, frequency_first.to_numpy(), bar_width, alpha=0.5, color=color, label=year)
    ax_last.bar(index + bar_width * n, frequency_last.to_numpy(), bar_width, alpha=0.5, color=color, label=year)

# Put each tick under the middle of its group of bars.
tick_positions = index + bar_width * (len(years) - 1) / 2
# Captions were mis-encoded in the source; restored to Russian.
ax_first.set_xlabel('Буква алфавита')
ax_first.set_ylabel('Частота, %')
ax_first.set_title('Первая буква в имени')
ax_first.set_xticks(tick_positions)
ax_first.set_xticklabels(ascii_uppercase)
ax_first.legend()
ax_last.set_xlabel('Буква алфавита')
ax_last.set_ylabel('Частота, %')
ax_last.set_title('Последняя буква в имени')
ax_last.set_xticks(tick_positions)
ax_last.set_xticklabels(ascii_uppercase)
ax_last.legend()
fig_first.tight_layout()
fig_last.tight_layout()
plt.show()
áá»á±á¬áºááŒá¬ážáá°áá»á¬áž (ááá¹áááá»á¬ážá á¡ááá¯áá±á¬áºáá»á¬ážá ááá¯ááºáá±á¬ááºáá»á¬ážá áá¯ááºááŸááºáá¬ááºáá±á¬ááºáá»á¬áž) ááᯠá á¬áááºážááŒá¯á á¯ááŒá®áž áá¬áááºáá»á¬ážá ááœá±á·ááŒá±á¬ááºážááŸá¯á¡áá±á«áº áááºážááá¯á·á ááŒáá¬ááœáŸááºážááá¯ážááŸá¯ááᯠá¡áá²ááŒááºááŒáá«á áá¯á·á
# Year-by-year births for a few names associated with famous people, one
# figure per name, each restricted to the sex given in the mapping.
celebrities = {'Frank': 'M', 'Britney': 'F', 'Madonna': 'F', 'Bob': 'M'}
# Use the full span explicitly: `years` left over from the letter histograms
# holds only four years, which would reduce every trend line to four points.
years = np.arange(1880, 2011)
dataframes = []
for year in years:
    dataset = datalist.format(year=year)
    dataframe = pd.read_csv(dataset, names=['name', 'sex', 'count'])
    dataframes.append(dataframe.assign(year=year))
result = pd.concat(dataframes)

for celebrity, sex in celebrities.items():
    dataframe = result[(result.name == celebrity) & (result.sex == sex)]
    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    # Captions were mis-encoded in the source; restored to Russian.
    ax.set_xlabel('Года', fontsize=10)
    ax.set_ylabel('Рождаемость', fontsize=10)
    ax.plot(dataframe['year'], dataframe['count'], label=celebrity, color='r', ls='-')
    ax.legend(loc=9, fontsize=12)
# Shown once, after every figure has been created.
plt.show()
áá±á·áá»áá·áºááŸá¯á¡ááœááºá áááºááẠáá¬áááºáá»á¬ážá ááœá±á·ááŒá±á¬ááºážááŸá¯á¡áá±á«áº áááºážááá¯á·áááŒáá¬ááœáŸááºážááá¯ážááŸá¯ááᯠááŸááºážááŸááºážáááºážáááºážá¡áá²ááŒááºáááºá¡ááœáẠáá±á¬ááºáá¯á¶ážááá°áá¬á០ááŒááºáá¬áááºáá¬ááŒááºáá¬ááŸá¯ááœáẠáá»á±á¬áºááŒá¬ážáá°áá»á¬ážááááá¬áááᯠááá·áºááœááºážááá¯ááºáááºá
áá®ááá¯áá¯ááºááŒááºážá¡á¬ážááŒáá·áº áá»áœááºáá±á¬áºááá¯á·áá²á· áááºááŸááºážáá»ááºááœá±á¡á¬ážáá¯á¶ážááᯠááŒá®ážááŒá±á¬ááºá¡á±á¬ááºááŒááºáá²á·áá«áááºá áá»áœááºá¯ááºááá¯á·ááẠPython ááœáẠáá±áá¬á¡á¯ááºá á¯ááœá²á·ááŒááºážááŸáá·áº áá¯á¶áá±á¬áºááŒááºážá¡ááœáẠáááááá¬áá»á¬ážááᯠá¡áá¯á¶ážááŒá¯ááŒááºážá áœááºážáááºááᯠáá®ááœááºáá²á·ááŒá®áž áá±áá¬ááŸáá·áº áá»áœááºá¯ááºááá¯á· áááºáááºáá¯ááºáá±á¬ááºááœá¬ážáá«áááºá áá°ááá¯ááºážááẠá¡áááºááá·áºáá¯ááºáá¬ážáá±á¬á ááŒááºáá¬ááŒááºáá¬áá±á¬ á¡áá»ááºá¡áááºáá»á¬ážá¡áá±á«áº á¡ááŒá±áá¶á áá±á¬ááºáá»ááºááœá²ááá¯ááºáááºá
áá°ááá¯ááºážá¡ááœáẠááá¯áá¯áá
source: www.habr.com