python - login to webpage and save as png

Question

I have a website that requires authentication, and I want to capture an image (screenshot) of a page that is only reachable after logging in. I'm using the following script:

import os
import requests
from subprocess import Popen, PIPE
from selenium import webdriver

def abspath(*p):
    """Join the given path components and return the absolute path."""
    return os.path.abspath(os.path.join(*p))


# Directory containing this script; get_screen_shot() uses it as the default
# output directory.
ROOT = abspath(os.path.dirname(__file__))


def execute_command(command):
    """Run ``command`` through the shell and fail if it prints anything.

    Only the command's standard output is captured; any non-whitespace
    output is treated as an error report.

    :param command: the complete command line, as a single string.
    :raises Exception: carrying the captured output when the command wrote
        anything other than whitespace to stdout.
    """
    process = Popen(command, shell=True, stdout=PIPE)
    # communicate() drains stdout, waits for the child to exit and closes the
    # pipe, so neither the file handle nor the process is left dangling.
    output, _ = process.communicate()
    if output.strip():
        raise Exception(output)


def do_screen_capturing(url, screen_path, width, height):
    """Open ``url`` in PhantomJS and save a screenshot to ``screen_path``.

    :param url: address of the page to load.
    :param screen_path: file path the screenshot is written to.
    :param width: window width; the window is only resized when both
        ``width`` and ``height`` are truthy.
    :param height: window height (see ``width``).
    """
    print("Capturing screen..")
    # PhantomJS writes its service log file into the current directory.
    # To store the log elsewhere, create the driver like this instead:
    # driver = webdriver.PhantomJS(service_log_path='/var/log/phantomjs/ghostdriver.log')
    driver = webdriver.PhantomJS()
    try:
        driver.set_script_timeout(30)
        if width and height:
            driver.set_window_size(width, height)
        driver.get(url)
        driver.save_screenshot(screen_path)
    finally:
        # Always shut the browser down, even when loading or saving fails;
        # otherwise every call leaves a PhantomJS process behind.
        driver.quit()


def do_crop(params):
    """Crop the captured screenshot with ImageMagick's ``convert``.

    :param params: dict with the keys ``screen_path`` (input image),
        ``crop_path`` (output image), ``width`` and ``height`` (size of the
        region to keep, taken at offset ``+0+0``).
    :raises Exception: propagated from ``execute_command`` when the command
        prints output.
    """
    # Local import: the file only imports Popen and PIPE from subprocess.
    from subprocess import list2cmdline

    print("Croping captured image..")
    command = [
        'convert',
        params['screen_path'],
        '-crop', '%sx%s+0+0' % (params['width'], params['height']),
        params['crop_path']
    ]
    # A plain ' '.join() breaks as soon as a path contains a space;
    # list2cmdline() quotes such arguments and leaves the others untouched.
    execute_command(list2cmdline(command))


def do_thumbnail(params):
    """Create a thumbnail of the cropped image with ImageMagick's ``convert``.

    :param params: dict with the keys ``crop_path`` (input image),
        ``thumbnail_path`` (output image), ``width`` and ``height`` (size
        passed to ``-thumbnail``).
    :raises Exception: propagated from ``execute_command`` when the command
        prints output.
    """
    # Local import: the file only imports Popen and PIPE from subprocess.
    from subprocess import list2cmdline

    print("Generating thumbnail from croped captured image..")
    command = [
        'convert',
        params['crop_path'],
        '-filter', 'Lanczos',
        '-thumbnail', '%sx%s' % (params['width'], params['height']),
        params['thumbnail_path']
    ]
    # A plain ' '.join() breaks as soon as a path contains a space;
    # list2cmdline() quotes such arguments and leaves the others untouched.
    execute_command(list2cmdline(command))


def get_screen_shot(**kwargs):
    """Capture a page screenshot and optionally crop and thumbnail it.

    Keyword arguments:
        url: page to capture (required).
        width, height: size of the browser window (default 1024 x 768).
        filename: name of the screenshot file (default ``'screen.png'``).
        path: directory the images are written to (default ``ROOT``).
        crop: crop the captured screen (default ``False``).
        crop_width, crop_height: crop size (default ``width`` / ``height``).
        crop_replace: when true the cropped image overwrites the screenshot;
            otherwise it is written as ``'crop_' + filename``.
        thumbnail: generate a thumbnail; requires ``crop=True``.
        thumbnail_width, thumbnail_height: thumbnail size
            (default ``width`` / ``height``).
        thumbnail_replace: when true the thumbnail overwrites the cropped
            image; otherwise it is written as ``'thumbnail_' + filename``.

    :returns: tuple ``(screen_path, crop_path, thumbnail_path)``. A path
        whose step was not requested equals ``screen_path``.
    :raises KeyError: when ``url`` is missing.
    :raises Exception: when ``thumbnail`` is requested without ``crop``.
    """
    url = kwargs['url']
    width = int(kwargs.get('width', 1024))
    height = int(kwargs.get('height', 768))
    filename = kwargs.get('filename', 'screen.png')
    path = kwargs.get('path', ROOT)

    crop = kwargs.get('crop', False)
    crop_width = int(kwargs.get('crop_width', width))
    crop_height = int(kwargs.get('crop_height', height))
    crop_replace = kwargs.get('crop_replace', False)

    thumbnail = kwargs.get('thumbnail', False)
    thumbnail_width = int(kwargs.get('thumbnail_width', width))
    thumbnail_height = int(kwargs.get('thumbnail_height', height))
    thumbnail_replace = kwargs.get('thumbnail_replace', False)

    screen_path = abspath(path, filename)
    crop_path = thumbnail_path = screen_path

    # Validate before launching the browser so a bad call fails fast.
    if thumbnail and not crop:
        raise Exception('Thumnail generation requires crop image, set crop=True')

    do_screen_capturing(url, screen_path, width, height)

    if crop:
        if not crop_replace:
            crop_path = abspath(path, 'crop_' + filename)
        do_crop({
            'width': crop_width, 'height': crop_height,
            'crop_path': crop_path, 'screen_path': screen_path})

        if thumbnail:
            if thumbnail_replace:
                # Replace the *cropped* image, which is not necessarily the
                # original screenshot when crop_replace is off.
                thumbnail_path = crop_path
            else:
                thumbnail_path = abspath(path, 'thumbnail_' + filename)
            do_thumbnail({
                'width': thumbnail_width, 'height': thumbnail_height,
                'thumbnail_path': thumbnail_path, 'crop_path': crop_path})
    return screen_path, crop_path, thumbnail_path


if __name__ == '__main__':
    # Requirements:
    #   Install NodeJS
    #   Using Node's package manager install phantomjs: npm -g install phantomjs
    #   install selenium (in your virtualenv, if you are using that)
    #   install imageMagick
    #   add phantomjs to system path (on windows)
    s = requests.Session()
    s.auth = ('username', 'password')
    # NOTE(review): verify=False turns off TLS certificate verification;
    # confirm this is acceptable for the target host.
    r = s.get('https://website.com:8443/path/to/site', verify=False)
    r.raise_for_status()
    # The browser needs the page's address, not its content: r.text is the
    # response body (HTML), and passing it as the URL yields a blank image.
    url = r.url
    # NOTE(review): the browser started by get_screen_shot() does not share
    # this requests session, so its login state is not carried over and the
    # screenshot may still show the login page.
    screen_path, crop_path, thumbnail_path = get_screen_shot(
        url=url, filename='test.png',
        crop=True, crop_replace=False,
        thumbnail=True, thumbnail_replace=False,
        thumbnail_width=200, thumbnail_height=150,
    )

I know it's authenticating because `print r.status_code` gives 200 and `r.headers` returns the headers; `r.text`, however, gives encoding errors. The code above does not fail, but it produces a blank image.

This is on a Windows machine.

edit - if I remove the requests to just hit the URL without logging in -

    # Skip the requests login and point the browser straight at the page.
    url = 'https://website.com:8443/path/to/site'
    screen_path, crop_path, thumbnail_path = get_screen_shot(**{
        'url': url,
        'filename': 'test.png',
        'crop': True,
        'crop_replace': False,
        'thumbnail': True,
        'thumbnail_replace': False,
        'thumbnail_width': 200,
        'thumbnail_height': 150,
    })

It takes a screenshot of the login page. What I want to be able to do is get a screenshot of the page after logging in.

score 3 · Answer 1 · answered Mar 03 '15 at 02:17

3

It looks like save_screenshot() is called when the page is not completely loaded.

In this case, you need to explicitly wait for an element (for example, a login form) to become visible before taking the screenshot. Example:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)

# Hold off on the screenshot until the form with id "myForm" is visible
# (10 is the timeout argument given to WebDriverWait).
form_is_visible = EC.visibility_of_element_located((By.ID, "myForm"))
waiter = WebDriverWait(driver, 10)
waiter.until(form_is_visible)

driver.save_screenshot(screen_path)

answered Mar 03 '15 at 02:17

alecxe

462,703
120
1,088
1,195

so if I remove the authentication and just do a scrape of the page it returns the login page. Adding the wait is not doing anything. my session is logging me in, it's just trying to figure out how to use the session data to stop my screenshot from being the login page when I'm trying to authenticate past it. Hope that makes sense. – whoisearth Mar 03 '15 at 02:32
@whoisearth okay, thanks for the explanation. Can you reproduce the same problem using Chrome or Firefox instead of PhantomJS? Thanks. – alecxe Mar 03 '15 at 02:34
Using Firefox I get a blank browser window open and I do get a traceback. – whoisearth Mar 03 '15 at 02:51
@whoisearth this is something. What is inside the traceback? – alecxe Mar 03 '15 at 03:07
@whoisearth this can be due to compatibility issues, which selenium package version and firefox version are you using? Also, what about chrome? – alecxe Mar 03 '15 at 03:12
selenium v2.45.0, firefox v36, chrome v40. I'm confused though: reading more, would the problem not be due to the requests session data being different from the session data for selenium? I'm trying to find out how, but I'm guessing I need to somehow export the cookie after a successful login using requests and then get selenium to import that cookie? – whoisearth Mar 03 '15 at 03:34
@whoisearth yup, you need to save cookies after login and then add them back. As an option, you can pickle/unpickle them, see http://stackoverflow.com/questions/15058462/how-to-save-and-load-cookies-using-python-selenium-webdriver. Or, via a firefox profile: http://stackoverflow.com/questions/16504038/selenium-test-runs-wont-save-cookies – alecxe Mar 03 '15 at 12:52

python - login to webpage and save as png

1 Answer