-

   rss_rss_hh_new

 - e-mail

 

 -

 LiveInternet.ru:
: 17.03.2011
:
:
: 51

:


Python Data Engineer

, 03 2017 . 11:39 +
. , , , ? , ?


. , , :
, Kafka, Elasticsearch Hadoop, .

, :
Data Engineer' !
.
, . Data Scientist', .

, , . !

, . , , , : , ? , .

, -, , data engineer'; -, , .

, , , , .


, , , ad-hoc .

, , :
  1. ( .. ML)
  2. BI
  3. noSQL
  4. real-time
  5. command-line tools

. . , , , batch- real-time . , , .

, , . , .


, . , . :
  1. real-time, , - real-time,
  2. , , , , .

, . , , Selenium. , PhantomJS , headless, , .

, . --- (: , , ):

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from numpy.random import choice
import time
import numpy as np

# Headless PhantomJS session that presents itself with a desktop-browser
# user agent string instead of PhantomJS's default one.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 YaBrowser/17.6.1.745 Yowser/2.5 Safari/537.36"
)
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.set_window_size(1024, 768)

# Hosts visited on every pass of the main loop (address masked in this copy).
hosts = ["35.190.***.***"]
# Maps a user type to the link-text fragment that type of user prefers.
# NOTE(review): the values are empty strings here -- presumably the original
# keywords were lost; confirm before running.
keywords = {
    "music": "",
    "grammar": "",
}
# Probability given to the keyword-matching link when choosing the next click.
conformity = 0.9
# Guard value of the main loop; the loop runs while this equals 1.
condition = 1

def _candidate_weights(count, conformity):
    """Return click probabilities for `count` links, favouring the last one.

    The last link gets probability `conformity`; the remaining
    `1 - conformity` is split evenly between the other links.  With a single
    link there is nothing to split, so it gets probability 1.
    """
    if count == 1:
        return [1.0]
    share = (1 - conformity) / float(count - 1)
    return [share] * (count - 1) + [conformity]


def user_journey(host, journey_length, keywords, user_type, conformity):
    """Simulate one visitor clicking through the site served at `host`.

    Uses the module-level `driver`.  After opening the start page and
    following the entry link, performs up to `journey_length` steps.  On each
    step the candidate links of the current page are collected and one of
    them is clicked after a random (Poisson, mean 5 s) pause.

    Args:
        host: Host name or address; "http://" is prepended.
        journey_length: Maximum number of click steps.
        keywords: Mapping from user type to the link-text fragment that this
            type of user prefers.
        user_type: Key into `keywords` for the simulated user.
        conformity: Probability of following a link matching the user's
            keyword when such a link exists on the page.

    The URL reached after every click is printed.  Any error during a step
    ends the journey early; the shared browser is left open because later
    journeys reuse it.
    """
    driver.get("http://" + host)
    # Entry point of the journey, located by link text.
    # NOTE(review): the link text is an empty string in this copy -- confirm.
    driver.find_element_by_link_text("").click()
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(driver.current_url)

    for _ in range(journey_length):
        try:
            # Article links first, then the alternative "mobile" shortlist.
            links = driver.find_elements_by_class_name("b-ff-articles-shortlist__a")
            if not links:
                links = driver.find_elements_by_class_name("b-ff-mobile-shortlist__a")
            if not links:
                # No article links at all: fall back to tag links.
                tag_links = driver.find_elements_by_class_name("b-ff-articles-tags__a")
                if not tag_links:
                    print("no links to follow on " + driver.current_url)
                    return
                # The original referenced `.click` without calling it, so this
                # navigation never happened.  After the click the elements
                # found above belong to the previous page, so start the next
                # step from scratch.
                tag_links[0].click()
                print(driver.current_url)
                continue

            keyword_links = driver.find_elements_by_partial_link_text(
                keywords.get(user_type))
            if keyword_links:
                # One random keyword link goes last and receives `conformity`.
                links.append(keyword_links[choice(len(keyword_links))])
                weights = _candidate_weights(len(links), conformity)
            else:
                # Nothing matches the user's interest: wander uniformly.
                weights = None

            # Draw an index rather than passing the element list to numpy.
            target = links[choice(len(links), p=weights)]
            time.sleep(np.random.poisson(5))  # simulated reading time
            target.click()
            print(driver.current_url)
        except Exception as exc:
            # `except Exception` lets Ctrl-C / SystemExit through, unlike the
            # previous bare `except:`.  The original handler named
            # `driver.close` without calling it, so the browser was never
            # closed; closing the shared driver would break every later
            # journey, so the journey is simply abandoned.
            print("journey aborted: %r" % (exc,))
            return

# Traffic generator main loop: while `condition` stays 1, run one simulated
# journey per host, each with a freshly drawn length and user type.
while condition == 1:
    for target_host in hosts:
        # Number of clicks in this journey (Poisson-distributed, mean 5).
        steps = np.random.poisson(5)
        # Pick one of the configured user types at random.
        persona = choice(list(keywords), 1)[0]
        print(persona)
        user_journey(target_host, steps, keywords, persona, conformity)

? . ( ): , . .

: , , + , . , - .

, , . .



, 20-30 , gnu parallel. :
$ parallel -j0 python ::: cannon.py


  1. nginx.
  2. ip- .
  3. javascript .
  4. , .
  5. Kafka.
  6. ...

, 6 Data Engineer.
Original source: habrahabr.ru (comments, light).

https://habrahabr.ru/post/334756/

:  

: [1] []
 

:
: 

: ( )

:

  URL