Scrape google result description

Question:
Im trying to neatly display google result descriptions but It throws in \xa0 every time theres “…”. My code is as follows:

import requests
import urllib
import pandas as pd
import pprint
import shutil
from requests_html import HTML
from requests_html import HTMLSession
global terminal_size
global cols
global rows

terminal_size = shutil.get_terminal_size(fallback=(120, 50))
cols = terminal_size.columns
rows = terminal_size.lines

query = input("SEARCH: ")

def get_source(url):
    """Return the source code for the provided URL. 

    Args: 
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html. 
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)
      
def get_results(query):
    query = urllib.parse.quote_plus(query)
    try:
      pages = int(input("PAGES: "))
    except ValueError:
      print("PAGES must be a number!")
      clear()
      os.system('from main import clear; clear()')
    num = pages*10
    response = get_source(f"https://www.google.co.uk/search?q={query}&num={num}")
    
    return response
  
def parse_results(response):
    
    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".VwiC3b"
    
    results = response.html.find(css_identifier_result)

    output = []
    
    for result in results:

        item = {
            'Title': pprint.pformat(result.find(css_identifier_title, first=True).text, indent=0, width=250, depth=None, compact=False, sort_dicts=False),
            'Link': pprint.pformat(result.find(css_identifier_link, first=True).attrs['href'], indent=0, width=250, depth=None, compact=False, sort_dicts=False),
            'Text': pprint.pformat(result.find(css_identifier_text, first=True).text, indent=0, width=250, depth=None, compact=False, sort_dicts=False)
        }

        output.append(item)
        
    for out in output:
      print("\nTitle: " + out.get("Title") + "\nLink: " + out.get("Link") + "\nInfo: " + out.get("Text"))
  
def google_search(query):
    response = get_results(query)
    return parse_results(response)

google_search(query)

Here is what Its returning:

SEARCH: test
PAGES: 2

Title: 'Test.com: Home'
Link: 'https://test.com/'
Info: 'Want to test your internet upload and download speeds? ... Looking for a test and certification management solution for you business or organization?'

Title: 'Speedtest by Ookla - The Global Broadband Speed Test'
Link: 'https://www.speedtest.net/'
Info: 'Use Speedtest on all your devices with our free desktop and mobile apps.'

Title: 'Test Definition & Meaning - Merriam-Webster'
Link: 'https://www.merriam-webster.com/dictionary/test'
Info: 'verb ; 1 · to put to test or proof : try. test out your strength ; 3 · to use tests as a way to analyze or identify. test for copper.'

Title: 'Test - Wikipedia'
Link: 'https://en.wikipedia.org/wiki/Test'
Info: "Test (assessment), an educational assessment intended to measure the respondents' knowledge or other abilities\xa0..."

Title: 'Test Definition & Meaning - Dictionary.com'
Link: 'https://www.dictionary.com/browse/test'
Info: 'A test is a collection of questions, tasks, or problems that are designed to see if a person understands a subject or to measure their ability to do something.'

Title: 'test - Wiktionary'
Link: 'https://en.wiktionary.org/wiki/test'
Info: 'To challenge. · To refine (gold, silver, etc.) · To put to the proof; to prove the truth, genuineness, or quality of by experiment, or by some principle or\xa0...'

Title: 'Testing resources - COVID.gov'
Link: 'https://www.covid.gov/tests'
Info: 'At-\u2060home tests are available for sale around the U.S. Check with local retailers and pharmacies to see ... My COVID-\u206019 Home Test Kits are about to expire.'

Title: 'COVID-19 Testing: What You Need to Know - CDC'
Link: 'https://www.cdc.gov/coronavirus/2019-ncov/symptoms-testing/testing.html'
Info: 'These tests detect viral genetic material, which may stay in your body for up to 90 days after you test positive. Therefore, you should not use a NAAT if you\xa0...'

Title: 'Fast.com: Internet Speed Test'
Link: 'https://fast.com/'
Info: "How fast is your download speed? In seconds, FAST.com's simple Internet speed test will estimate your ISP speed."

Title: 'Mobile-Friendly Test - Google Search Console'
Link: 'https://search.google.com/test/mobile-friendly'
Info: 'test code. About this tool. Test how easily a visitor can use your page on a mobile device. Just enter a page URL to see how your page scores.'

Title: 'Take a Test'
Link: 'https://implicit.harvard.edu/implicit/takeatest.html'
Info: "Preliminary Information. On the next page you'll be asked to select an Implicit Association Test (IAT) from a list of possible topics ."

Title: 'COVID-19 Testing and Locations | MinuteClinic - CVS'
Link: 'https://www.cvs.com/minuteclinic/covid-19-testing'
Info: 'CVS Health is offering no cost coronavirus testing. Get a rapid COVID test for same day results, or a molecular lab test (PCR/NAAT) for results within days.'

Title: 'TEST | Home - Springer'
Link: 'https://www.springer.com/journal/11749'
Info: 'TEST is an international journal of Statistics and Probability, sponsored by the Spanish Society of Statistics and Operations Research (www.seio.es). Its ..'

Title: 'Test IO: QA Testing-as-a-Service'
Link: 'https://test.io/'
Info: 'Test IO delivers a full range of web, mobile, and IoT testing, delivered as a service.'

Title: 'Testing.com: Order Lab Tests and Blood Tests Online'
Link: 'https://www.testing.com/'
Info: 'Testing.com is a trusted health resource designed to help patients and caregivers easily order and understand the many lab tests that are a vital part of\xa0...'

Title: 'The Praxis Tests - ETS'
Link: 'https://www.ets.org/praxis/site.html'
Info: "I am a test taker. I'm entering an educator preparation program or seeking certification. Select. I am part of an educator preparation program."

Title: 'At-Home OTC COVID-19 Diagnostic Tests - FDA'
Link: 'https://www.fda.gov/medical-devices/coronavirus-covid-19-and-medical-devices/home-otc-covid-19-diagnostic-tests'
Info: '7 days ago — When using a COVID-19 antigen diagnostic test, the FDA recommends repeat testing following a negative result, whether you have symptoms or not,\xa0...'

Heres what I want:

SEARCH: test
PAGES: 2

Title: 'Test.com: Home'
Link: 'https://test.com/'
Info: 'Want to test your internet upload and download speeds? ... Looking for a test and certification management solution for you business or organization?'

Title: 'Speedtest by Ookla - The Global Broadband Speed Test'
Link: 'https://www.speedtest.net/'
Info: 'Use Speedtest on all your devices with our free desktop and mobile apps.'

Title: 'Test Definition & Meaning - Merriam-Webster'
Link: 'https://www.merriam-webster.com/dictionary/test'
Info: 'verb ; 1 · to put to test or proof : try. test out your strength ; 3 · to use tests as a way to analyze or identify. test for copper.'

Title: 'Test - Wikipedia'
Link: 'https://en.wikipedia.org/wiki/Test'
Info: "Test (assessment), an educational assessment intended to measure the respondents' knowledge or other abilities ..."

Title: 'Test Definition & Meaning - Dictionary.com'
Link: 'https://www.dictionary.com/browse/test'
Info: 'A test is a collection of questions, tasks, or problems that are designed to see if a person understands a subject or to measure their ability to do something.'

Title: 'test - Wiktionary'
Link: 'https://en.wiktionary.org/wiki/test'
Info: 'To challenge. · To refine (gold, silver, etc.) · To put to the proof; to prove the truth, genuineness, or quality of by experiment, or by some principle or ...'

Title: 'Testing resources - COVID.gov'
Link: 'https://www.covid.gov/tests'
Info: 'At-home tests are available for sale around the U.S. Check with local retailers and pharmacies to see ... My COVID-19 Home Test Kits are about to expire.'

Title: 'COVID-19 Testing: What You Need to Know - CDC'
Link: 'https://www.cdc.gov/coronavirus/2019-ncov/symptoms-testing/testing.html'
Info: 'These tests detect viral genetic material, which may stay in your body for up to 90 days after you test positive. Therefore, you should not use a NAAT if you ...'

Title: 'Fast.com: Internet Speed Test'
Link: 'https://fast.com/'
Info: "How fast is your download speed? In seconds, FAST.com's simple Internet speed test will estimate your ISP speed."

Title: 'Mobile-Friendly Test - Google Search Console'
Link: 'https://search.google.com/test/mobile-friendly'
Info: 'test code. About this tool. Test how easily a visitor can use your page on a mobile device. Just enter a page URL to see how your page scores.'

Title: 'Take a Test'
Link: 'https://implicit.harvard.edu/implicit/takeatest.html'
Info: "Preliminary Information. On the next page you'll be asked to select an Implicit Association Test (IAT) from a list of possible topics ."

Title: 'COVID-19 Testing and Locations | MinuteClinic - CVS'
Link: 'https://www.cvs.com/minuteclinic/covid-19-testing'
Info: 'CVS Health is offering no cost coronavirus testing. Get a rapid COVID test for same day results, or a molecular lab test (PCR/NAAT) for results within days.'

Title: 'TEST | Home - Springer'
Link: 'https://www.springer.com/journal/11749'
Info: 'TEST is an international journal of Statistics and Probability, sponsored by the Spanish Society of Statistics and Operations Research (www.seio.es). Its ..'

Title: 'Test IO: QA Testing-as-a-Service'
Link: 'https://test.io/'
Info: 'Test IO delivers a full range of web, mobile, and IoT testing, delivered as a service.'

Title: 'Testing.com: Order Lab Tests and Blood Tests Online'
Link: 'https://www.testing.com/'
Info: 'Testing.com is a trusted health resource designed to help patients and caregivers easily order and understand the many lab tests that are a vital part of ...'

Title: 'The Praxis Tests - ETS'
Link: 'https://www.ets.org/praxis/site.html'
Info: "I am a test taker. I'm entering an educator preparation program or seeking certification. Select. I am part of an educator preparation program."

Title: 'At-Home OTC COVID-19 Diagnostic Tests - FDA'
Link: 'https://www.fda.gov/medical-devices/coronavirus-covid-19-and-medical-devices/home-otc-covid-19-diagnostic-tests'
Info: '7 days ago — When using a COVID-19 antigen diagnostic test, the FDA recommends repeat testing following a negative result, whether you have symptoms or not, ...'

Since xa0 does not really affect the three dots at end, why not just find if it exists and remove it.

Thats what I’m trying to find out how to do i tried .replace(), and remove() but they didnt do a thing. Im so sorry I’m still a beginner.

Oh well I’m myself too much of a beginner in python, I tried some if statements and rfind + substring methods to achieve it but it gave invalid syntax error so I just wrote that statement without answer, thinking you might know how to do it :laughing: I’ll check some other ways if I find one, or if I could find solution to that invalid syntax error.

Edit: I just found my mistake which gave invalid syntax, turns out I was using && which is and operator in javascript but python uses simple and not &&, Makes it even funnier :laughing:

&& is actually the operator used by many, many programming languages, not just JavaScript (Probably the majority of them. Operators often don’t change too much between languages, most languages have the standard +, -, *, /, = operators and the standard ==, !=, <, <=, >, >=, &&, || logical operators).

I found my previous approach, which I was supposed to post earlier, ineffective; So here’s a better approach I found:

if item.get("Text").endswith("..."):
  item["Text"] = " ".join(item.get("Text").rsplit("\xa0", 1))
   
print(item.get("Text"))

This will replace last occuring “\xa0” with " ", if the description ends with “…”
Also, add this case before output.append

It dont work the out put stays the same

I have perfected it.
Here is the code:

import requests
import urllib
import pandas as pd
import pprint
import time
from termcolor import colored, cprint
from requests_html import HTML
from requests_html import HTMLSession
from colorama import Fore,Back,Style

query = input(colored(f"{Fore.BLUE}S{Fore.RED}E{Fore.LIGHTYELLOW_EX}A{Fore.BLUE}R{Fore.RED}C{Fore.LIGHTYELLOW_EX}H{Fore.BLUE}:{Fore.WHITE} ", attrs=['bold']))

def get_source(url):
    """Return the source code for the provided URL. 

    Args: 
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html. 
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)
def get_results(query):
    query = urllib.parse.quote_plus(query)
    try:
      pages = int(input(colored(f"{Fore.BLUE}P{Fore.RED}A{Fore.LIGHTYELLOW_EX}G{Fore.BLUE}E{Fore.RED}S{Fore.LIGHTYELLOW_EX}:{Fore.WHITE} ", attrs=['bold'])))
    except ValueError:
      print("PAGES must be a number!")
      clear()
    num = pages*10
    response = get_source(f"https://www.google.co.uk/search?q={query}&num={num}")

    return response
  
def parse_results(response):
    
    css_identifier_result = ".tF2Cxc"
    css_identifier_title = "h3"
    css_identifier_link = ".yuRUbf a"
    css_identifier_text = ".VwiC3b"
    
    results = response.html.find(css_identifier_result)

    output = []
    
    for result in results:

        item = {
            'Title': result.find(css_identifier_title, first=True).text,
            'Link': result.find(css_identifier_link, first=True).attrs['href'],
            'Text': result.find(css_identifier_text, first=True).text
        }
        
        output.append(item)
      
    for out in output:
      print(colored(f"\n{Fore.RED}{out.get('Title')}", attrs=['bold']))
      print(colored(f"{Fore.BLUE}{out.get('Link')}", attrs=['underline']))
      print(colored(f"{Fore.GREEN}{out.get('Text')}"))
      time.sleep(0.10)
  
def google_search(query):
    response = get_results(query)
    return parse_results(response)

google_search(query)

Repl link: https://replit.com/@JonathanHowie/GoogleSearch?v=1

1 Like

Did you remove pretty print?

Btw happy to know you solved it yourself :slight_smile:

1 Like

Thanks. And yes I did remove pprint

This topic was automatically closed 7 days after the last reply. New replies are no longer allowed.