Python Data Engineer |
, Kafka, Elasticsearch Hadoop, .
Data Engineer' !
.
, . Data Scientist', .
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from numpy.random import choice
import time
import numpy as np
# --- Simulation parameters -------------------------------------------------
# Hosts whose front page each simulated journey starts from.
hosts = ["35.190.***.***"]
# User type -> link-text fragment that this type of user prefers to click.
# NOTE(review): the values are empty strings in this copy of the file --
# presumably non-ASCII text was lost; confirm against the original.
keywords = {"music": "", "grammar": ""}
# Probability of clicking the keyword-matching link on each step.
conformity = 0.9
# The main loop keeps generating journeys while this equals 1.
condition = 1

# --- Headless browser ------------------------------------------------------
# Start from the stock PhantomJS capabilities and override the user agent.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 YaBrowser/17.6.1.745 Yowser/2.5 Safari/537.36"
)
# A single driver instance is shared by every journey in this process.
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.set_window_size(1024, 768)
def _find_links_on_page():
    """Return the link elements of the current page, or an empty list.

    The CSS class names are tried in order; the first one that matches at
    least one element wins.
    """
    for class_name in (
        "b-ff-articles-shortlist__a",
        "b-ff-mobile-shortlist__a",
        "b-ff-articles-tags__a",
    ):
        found = driver.find_elements_by_class_name(class_name)
        if found:
            return found
    return []


def user_journey(host, journey_length, keywords, user_type, conformity):
    """Simulate one visitor clicking through the site served at ``host``.

    The module-level ``driver`` opens ``http://<host>``, clicks the entry
    link and then performs up to ``journey_length`` steps. On each step one
    link matching the user's keyword is picked at random and added to the
    page's regular links; that keyword link is then clicked with probability
    ``conformity``, otherwise one of the regular links is clicked with equal
    probability. The current URL is printed after every click.

    Args:
        host: Host name or address to visit (without scheme).
        journey_length: Number of click steps to attempt.
        keywords: Mapping of user type to the link-text fragment it prefers.
        user_type: Key into ``keywords`` for this visitor.
        conformity: Probability of clicking the keyword-matching link.

    Errors raised while loading the start page or clicking the entry link
    propagate to the caller. An error inside a single step only skips that
    step.
    """
    driver.get("http://" + host)
    # NOTE(review): the link text is an empty string in this copy of the
    # file -- presumably non-ASCII text was lost; confirm.
    entry_link = driver.find_element_by_link_text("")
    entry_link.click()
    print(driver.current_url)

    for _ in range(journey_length):
        try:
            links = _find_links_on_page()
            if not links:
                # Nothing to click on this page; skip the step.
                continue

            preferred = driver.find_elements_by_partial_link_text(
                keywords.get(user_type)
            )
            if not preferred:
                # No keyword-matching link on this page; skip the step.
                continue
            links.append(choice(preferred, 1)[0])

            # The keyword link (last entry) gets weight `conformity`; the
            # remaining mass is split evenly over the regular links.
            # `links` had at least one element before the append, so the
            # divisor is never zero.
            regular_weight = (1 - conformity) / float(len(links) - 1)
            weights = [regular_weight] * len(links)
            weights[-1] = conformity
            target = choice(links, 1, p=weights)[0]

            # Pause between clicks for a Poisson(5)-distributed number of
            # seconds.
            time.sleep(np.random.poisson(5))
            target.click()
            print(driver.current_url)
        except Exception:
            # Best effort: any error during a step just skips that step.
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit stop
            # the script. The driver is deliberately left open here because
            # the caller reuses the same module-level driver for the next
            # journey.
            continue
# Generate journeys for every host, over and over, while `condition` is 1.
# Nothing in this file changes `condition`, so the loop normally ends only
# when the process is interrupted or an error propagates out of a journey.
try:
    while condition == 1:
        for host in hosts:
            # Number of clicks in this journey.
            journey_length = np.random.poisson(5)
            # list(...) so a Python 3 dict view is accepted by choice().
            user_type = choice(list(keywords), 1)[0]
            print(user_type)
            user_journey(host, journey_length, keywords, user_type, conformity)
finally:
    # End the WebDriver session however the loop exits, so the headless
    # browser process is not left running.
    driver.quit()
$ parallel -j0 python ::: cannon.py