Анализируем требования рынка для data scientist
def get_list_id_vacancies(area, text):
    """Collect the ids of hh.ru vacancies matching a search query.

    Args:
        area: value sent as the ``area`` query parameter.
        text: value sent as the ``text`` query parameter.

    Returns:
        A list with the ``id`` field of every vacancy item returned by
        ``https://api.hh.ru/vacancies``. When paging is needed and a page
        request fails, the ids gathered so far are returned.

    Raises:
        requests.HTTPError: if the initial request, or the single-page
            request used for small result sets, returns an error status.
    """
    url_list = 'https://api.hh.ru/vacancies'
    # Limits taken from the original author's notes: the API is said to return
    # at most 500 vacancies per page and at most 2000 in total (pages 0..3).
    # NOTE(review): confirm these limits against the current API documentation.
    page_size = 500
    max_pages = 4

    params = {'text': text, 'area': area}
    r = requests.get(url_list, params=params)
    # Fail with a clear HTTP error instead of a KeyError on the error payload.
    r.raise_for_status()
    found = json.loads(r.text)['found']  # total number of vacancies found
    if not found:
        # Nothing matched; do not issue a request with per_page=0.
        return []

    if found <= page_size:
        # Everything fits on one page, so fetch it in a single request.
        params['per_page'] = found
        r = requests.get(url_list, params=params)
        r.raise_for_status()
        return [vac['id'] for vac in json.loads(r.text)['items']]

    list_id = []
    params['per_page'] = page_size
    for page in range(max_pages):
        params['page'] = page
        r = requests.get(url_list, params=params)
        if r.status_code != 200:
            # Best effort: keep whatever was collected before the failure.
            break
        items = json.loads(r.text)['items']
        if not items:
            # An empty page means there is nothing further to read.
            break
        list_id.extend(vac['id'] for vac in items)
    return list_id
def get_vacancy(id):
    """Fetch a single vacancy from the hh.ru API.

    Args:
        id: vacancy identifier substituted into the request URL.

    Returns:
        The response body decoded from JSON.
    """
    response = requests.get('https://api.hh.ru/vacancies/%s' % id)
    payload = json.loads(response.text)
    return payload
{
"alternate_url": "https://hh.ru/vacancy/22285538",
"code": null,
"premium": false,
"description": "Мы занимаемся....",
"schedule": {
"id": "fullDay",
"name": "Полный день"
},
"suitable_resumes_url": null,
"site": {
"id": "hh",
"name": "hh.ru"
},
"billing_type": {
"id": "standard_plus",
"name": "Стандарт+"
},
"published_at": "2017-09-05T11:43:08+0300",
"test": null,
"accept_handicapped": true,
"experience": {
"id": "noExperience",
"name": "Нет опыта"
},
"address": {
"building": "36с7",
"city": "Москва",
"description": null,
"metro": {
"line_name": "Калининская",
"station_id": "8.470",
"line_id": "8",
"lat": 55.736478,
"station_name": "Парк Победы",
"lng": 37.514401
},
"metro_stations": [
{
"line_name": "Калининская",
"station_id": "8.470",
"line_id": "8",
"lat": 55.736478,
"station_name": "Парк Победы",
"lng": 37.514401
}
],
"raw": null,
"street": "Кутузовский проспект",
"lat": 55.739068,
"lng": 37.525432
},
"key_skills": [
{
"name": "Математическое моделирование"
},
{
"name": "Анализ рисков"
}
],
"allow_messages": true,
"employment": {
"id": "full",
"name": "Полная занятость"
},
"id": "22285538",
"response_url": null,
"salary": {
"to": 90000,
"gross": false,
"from": 50000,
"currency": "RUR"
},
"archived": false,
"name": "Математик/ Data scientist",
"contacts": null,
"employer": {
"logo_urls": {
"90": "https://hhcdn.ru/employer-logo/1680554.png",
"240": "https://hhcdn.ru/employer-logo/1680555.png",
"original": "https://hhcdn.ru/employer-logo-original/309546.png"
},
"vacancies_url": "https://api.hh.ru/vacancies?employer_id=1475513",
"name": "Аналитическое агентство Скориста",
"url": "https://api.hh.ru/employers/1475513",
"alternate_url": "https://hh.ru/employer/1475513",
"id": "1475513",
"trusted": true
},
"created_at": "2017-09-05T11:43:08+0300",
"area": {
"url": "https://api.hh.ru/areas/1",
"id": "1",
"name": "Москва"
},
"relations": [],
"accept_kids": false,
"response_letter_required": false,
"apply_alternate_url": "https://hh.ru/applicant/vacancy_response?vacancyId=22285538",
"quick_responses_allowed": false,
"negotiations_url": null,
"department": null,
"branded_description": null,
"hidden": false,
"type": {
"id": "open",
"name": "Открытая"
},
"specializations": [
{
"profarea_id": "14",
"profarea_name": "Наука, образование",
"id": "14.91",
"name": "Информатика, Информационные системы"
},
{
"profarea_id": "14",
"profarea_name": "Наука, образование",
"id": "14.141",
"name": "Математика"
}]
}
{
"description": "Мы занимаемся....",
"schedule": {
"id": "fullDay",
"name": "Полный день"
},
"accept_handicapped": true,
"experience": {
"id": "noExperience",
"name": "Нет опыта"
},
"key_skills": [
{
"name": "Математическое моделирование"
},
{
"name": "Анализ рисков"
}
],
"employment": {
"id": "full",
"name": "Полная занятость"
},
"id": "22285538",
"salary": {
"to": 90000,
"gross": false,
"from": 50000,
"currency": "RUR"
},
"name": "Математик/ Data scientist",
"employer": {
"name": "Аналитическое агентство Скориста"
},
"area": {
"name": "Москва"
},
"specializations": [
{
"profarea_id": "14",
"profarea_name": "Наука, образование",
"id": "14.91",
"name": "Информатика, Информационные системы"
},
{
"profarea_id": "14",
"profarea_name": "Наука, образование",
"id": "14.141",
"name": "Математика"
}]
}
def get_salary(vac):
    """Return the salary fields of a vacancy as a flat dict.

    The salary is not always filled in, so reading its fields directly would
    fail. This helper always returns the keys ``currency``, ``from``, ``to``
    and ``gross``; any value that is unavailable is ``None``.

    Args:
        vac: vacancy mapping; its ``salary`` entry may be absent, ``None``,
            or a mapping holding only some of the four fields.

    Returns:
        A dict with exactly the four keys listed above.
    """
    # A missing or null salary is treated as a salary with no fields set.
    salary = vac.get('salary') or {}
    return {key: salary.get(key) for key in ('currency', 'from', 'to', 'gross')}
def get_connection():
    """Open and return a new pymysql connection to the local ``hh`` database."""
    return pymysql.connect(
        host='localhost',
        port=3306,
        user='root',
        password='-',
        db='hh',
        charset="utf8",
    )
def close_connection(conn):
    """Commit pending work on ``conn`` and close it.

    The connection is closed even when the commit raises, so a failed commit
    does not leave the connection open. The commit error still propagates.

    Args:
        conn: connection object exposing ``commit()`` and ``close()``.
    """
    try:
        conn.commit()
    finally:
        conn.close()
def insert_vac(conn, vac, text):
    """Store one vacancy together with its key skills and specializations.

    Inserts one row into ``vacancies`` and one row per entry of
    ``vac['key_skills']`` and ``vac['specializations']`` into the
    ``key_skills`` and ``specializations`` tables. The transaction is not
    committed here; that is left to the caller.

    Args:
        conn: open database connection providing ``cursor()``.
        vac: vacancy mapping as returned by ``get_vacancy``.
        text: search text the vacancy was found with; stored in the
            ``text_search`` column.
    """
    salary = get_salary(vac)
    print(vac['id'])  # progress output, one line per stored vacancy

    vacancy_row = (
        vac['id'], vac['name'], vac['description'],
        # The key is read defensively: a response without it stores NULL.
        vac.get('code'), vac['accept_handicapped'], vac['area']['name'],
        vac['employer']['name'],
        vac['employment']['name'], vac['experience']['name'], salary['currency'],
        salary['from'], salary['gross'],
        salary['to'], vac['schedule']['name'], text,
    )
    skill_rows = [(vac['id'], key_skill['name']) for key_skill in vac['key_skills']]
    spec_rows = [
        (vac['id'], spec['name'], spec['profarea_name'])
        for spec in vac['specializations']
    ]

    a = conn.cursor()
    try:
        a.execute(
            "INSERT INTO vacancies (id, name_v, description, code_hh, accept_handicapped, "
            "area_v, employer, employment, experience, salary_currency, salary_from, salary_gross, "
            "salary_to, schedule_d, text_search) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            vacancy_row,
        )
        if skill_rows:
            a.executemany(
                "INSERT INTO key_skills(vacancy_id, name) VALUES(%s, %s)",
                skill_rows,
            )
        if spec_rows:
            a.executemany(
                "INSERT INTO specializations(vacancy_id, name, profarea_name) VALUES(%s, %s, %s)",
                spec_rows,
            )
    finally:
        # Release the cursor even when one of the statements fails.
        a.close()
# Download every vacancy matching the search text and store it in the database.
text_search = 'data scientist'
# get_list_id_vacancies takes (area, text). The area is passed as None, which
# requests leaves out of the query string, so the search is not restricted to
# a region.
# NOTE(review): pass an hh.ru area id instead (the sample responses show "1"
# for Москва) if a single region is wanted.
list_id_vacs = get_list_id_vacancies(None, text_search)
vacs = [get_vacancy(vac_id) for vac_id in list_id_vacs]
conn = get_connection()
try:
    for vac in vacs:
        insert_vac(conn, vac, text_search)
except Exception:
    # Close without committing so a failed run does not leak the connection.
    conn.close()
    raise
else:
    close_connection(conn)
def get_vac_descriptions(conn, text_search):
    """Read the descriptions of all vacancies stored for a search text.

    Args:
        conn: open database connection providing ``cursor()``.
        text_search: value matched against the ``text_search`` column.

    Returns:
        Whatever the cursor's ``fetchall()`` returns: one row per vacancy,
        each holding the ``description`` column as its only element.
    """
    a = conn.cursor()
    try:
        # Parameters are passed as a sequence, as the DB-API specifies.
        a.execute("SELECT description FROM vacancies WHERE text_search = %s", (text_search,))
        return a.fetchall()
    finally:
        # The original referenced ``a.close`` without calling it, so the
        # cursor was never closed.
        a.close()
def get_popular_phrase(text, len, count_phrases, stopwords=None):
    """Return the most frequent word n-grams found in ``text``.

    The text is lower-cased and tokenized with ``nltk.word_tokenize``. An
    n-gram is counted only if none of its tokens is punctuation and none of
    them is a stop word.

    Args:
        text: text to analyse.
        len: number of tokens per phrase (the n-gram size). The name shadows
            the builtin but is kept because it is part of the signature.
        count_phrases: how many phrases to return; ``None`` returns all.
        stopwords: optional iterable of words; phrases containing any of them
            are skipped. Compared case-insensitively.

    Returns:
        A list of ``(phrase, count)`` pairs, most frequent first, where
        ``phrase`` is a tuple of tokens.
    """
    excluded = frozenset(word.lower() for word in stopwords) if stopwords else frozenset()
    punctuation = frozenset(string.punctuation)

    def is_word(token):
        # A token made up solely of punctuation characters is not a word.
        # Checking per character also rejects multi-character tokens such as
        # "``", "''", "..." and "--", which a substring test against
        # string.punctuation lets through.
        if all(char in punctuation for char in token):
            return False
        return token not in excluded

    words = nltk.word_tokenize(text.lower())
    phrase_counter = Counter(
        phrase
        for phrase in nltk.ngrams(words, len)
        if all(is_word(token) for token in phrase)
    )
    return phrase_counter.most_common(count_phrases)
# Print the 20 most frequent single words across the stored descriptions.
conn = get_connection()
try:
    descriptions = get_vac_descriptions(conn, 'data scientist')
finally:
    # The connection is only needed for this one query.
    conn.close()
# Join with a separator so the last word of one description does not merge
# with the first word of the next one.
text = '\n'.join(description[0] for description in descriptions)
result = get_popular_phrase(text, 1, 20)
for r in result:
    print(" ".join(r[0]) + " - " + str(r[1]))
def main(stopwords=None):
    """Print the 20 most frequent four-word phrases in the stored descriptions.

    Args:
        stopwords: optional collection of words to exclude. When given, it is
            forwarded as a fourth argument to ``get_popular_phrase``, which
            must accept it; when omitted, the three-argument call is used.
            The original body referenced ``stopwords`` without defining it.
    """
    conn = get_connection()
    try:
        descriptions = get_vac_descriptions(conn, 'data scientist')
    finally:
        # The connection is only needed for this one query.
        conn.close()
    # Join with a separator so words at description boundaries do not merge.
    text = '\n'.join(description[0] for description in descriptions)
    if stopwords is None:
        result = get_popular_phrase(text, 4, 20)
    else:
        result = get_popular_phrase(text, 4, 20, stopwords)
    for r in result:
        print(" ".join(r[0]) + " - " + str(r[1]))


main()
def get_stopwords():
    """Build a stop-word list from vacancies unrelated to data science.

    Reads the stored descriptions for the searches 'повар', 'уборщица' and
    'слесарь' and returns the most frequent single words found in them.

    Returns:
        A list of up to 200 words, most frequent first.
    """
    stopword_count = 200  # size of the stop-word list
    texts = []
    conn = get_connection()
    try:
        # One connection serves all three queries.
        for query in ('повар', 'уборщица', 'слесарь'):
            texts.extend(row[0] for row in get_vac_descriptions(conn, query))
    finally:
        conn.close()
    # Join with a separator so words at description boundaries do not merge.
    text = '\n'.join(texts)
    # Each result item is ((word,), count); keep only the word itself.
    return [phrase[0] for phrase, _count in get_popular_phrase(text, 1, stopword_count)]
# Append only the descriptions whose detected language is not English.
for description in descriptions:
    if detect(description[0]) == 'en':
        continue
    text += description[0]
Комментировать | « Пред. запись — К дневнику — След. запись » | Страницы: [1] [Новые] |