<spanclass="line"><span> pip install lxml</span></span></code></pre></div><p>Error: If you get an error when trying to install LXML, that is totally natural and reasonable. Sometimes, some may say, that's the benefit of using a tool such as Beautiful Soup, it manages many dependencies, so that new users don't have to.</p><p>Though in truth, the effort required to supply LXML's dependencies are relatively minimal.</p><p>The package depends on a series of <code>c</code> files; for Mac users, admittedly, this may require acquiring and updating XCode to include their Command Line Tools package.</p><p>For Windows users may have their own issues, regarding Visual C++ components; notice that LXML is dependent on C-language packages.</p><p>If you run into any issues, this is your chance to check out what solutions others have found using your favorite search engine.</p><p>And if still unable to resolve the errors you receive, please reach out to <ahref="mailto:canin@dreamfreely.org"target="_blank"rel="noreferrer">canin@dreamfreely.org</a>!</p><h3id="create-new-python-file"tabindex="-1">Create New Python File <aclass="header-anchor"href="#create-new-python-file"aria-label="Permalink to “Create New Python File”"></a></h3><p>Phew! We got through that entire process.</p><p>Congratuluation!!!</p><p>You've done some great work so far; we're navigating the command-line to build a custom toolset.</p><p>That is no small accomplishment!</p><p>Next up, we start building.</p><p>Open up Notepad, or your favorite text editor, and create new file; naming it however you like, though with the <code>.py</code> extention at the end.</p><h3id="import-libraries"tabindex="-1">Import Libraries <aclass="header-anchor"href="#import-libraries"aria-label="Permalink to “Import Libraries”"></a></h3><p>Our process for building our scraper file is very similar to the steps we took when building our webpage.</p><p>First we need to gather our necessary tools.</p><p>On the first line of our file we will import our first package by typing the command <code>import requests</code>.</p><p>Yup, it is that easy; so next we will import the tools we need from LXML with the following command:</p><divclass="language-"><buttontitle="Copy Code"class="copy"></button><spanclass="lang"></span><preclass="shiki shiki-themes github-light github-dark"style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;"tabindex="0"dir="ltr"><code><spanclass="line"><span> from lxml import html</span></span></code></pre></div><p>Feels almost magically simple doesn't it ?</p><p>Lastly, lets grab one more toolset by adding the line</p><divclass="language-"><buttontitle="Copy Code"class="copy"></button><spanclass="lang"></span><preclass="shiki shiki-themes github-light github-dark"style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;"tabindex="0"dir="ltr"><code><spanclass="line"><span> from pprint import pprint as ppr</span></span></code></pre></div><p>This is a tool that will allow us to print our data in a more readable format.</p><p>So let's get to scraping!!</p><h3id="get-site-requests"tabindex="-1">Get Site (requests) <aclass="header-anchor"href="#get-site-requests"aria-label="Permalink to “Get Site (requests)”"></a></h3><p>What website do you want to scrape?</p><p>Mind you, some websites load their data using JavaScript (many websites do, in fact.)</p><p>And these websites will require additional tools to scrape.</p><p>Nonetheless, the command to <em>scrape</em> a website is as 
follows:</p><divclass="language-"><buttontitle="Copy Code"class="copy"></button><spanclass="lang"></span><preclass="shiki shiki-themes github-light github-dark"style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;"tabindex="0"dir="ltr"><code><spanclass="line"><span> root = requests.get('https://www.linux.org')</span></span></code></pre></div><p>Operations will happen in the backgro
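Before we can loop over anything, the raw HTML that `requests` fetched needs to be handed to LXML and turned into a list of elements. A minimal sketch of that step follows; the variable names (`tree`, `items`) and the table XPath here are illustrative assumptions, and the example script in the Rebel Coding startScraping repository shows the exact expressions:

```
# hand the raw HTML to LXML, producing a searchable element tree
tree = html.fromstring(root.content)

# gather the elements to loop over; this XPath is a stand-in for the
# real one, which targets the news table on the page
items = tree.xpath('//table//tr')
```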
<spanclass="line"><span> d = {}</span></span>
<spanclass="line"><span> title = i.xpath('.//td[1]/*/a/font/text()')</span></span>
<spanclass="line"><span> ppr(d)</span></span></code></pre></div><p>We use a <em>for-loop</em> to run through the first 5 items in our list of items; and the first thing we do is create an empty dictionary in which to store our desired information.</p><p>We do this so that we can more easily access this information later.</p><p>Next, we use XPATH to specify the information we're after.</p><p>XPATH returns a list of elements by default; and if there are not items, it will return an empty list.</p><p>If there is one item, it will return a list with one item; and so in our next line, we extract that singular item and apply the <code>strip()</code> method to remove any excess empty space on either side of our news acquired <code>title</code>.</p><p>On the next line we shorten this process a bit, by simply adding the index position <code>[0]</code> to the end of our <code>xpath</code> command.</p><p>Lastly we use the Python tool <em>pretty print</em> to display our newly acquired data.</p><p>In order to run our code, we navigate to our file's location; hopefully you've saved it in our Virtual Environment's folder for ease of use.</p><p>And with our virtual environment activated we will run the command <code>python myFirstScrape.py</code>.</p><p>Though using whatever name you save your file as; having remembered the <code>.py</code> extension at the end.</p><p>Y'all just wrote your first web scraper!!!</p><p>Pour your a delicious glass of your favorite beverage or commence any other suitably celebrative action ~ cause y'all just did that!</p><h1id="more-about-python"tabindex="-1">More About Python <aclass="header-anchor"href="#more-about-python"aria-label="Permalink to “More About Python”"></a></h1><hr><p>We've glossed over quite a bit just to get ourselves up and running.</p><p>The example script provided in the Rebel Coding startScraping repository goes a bit deeper; so definitely check that out.</p><p>Though now you know how to use the Python <code>requests</code> package to mechanically grab websites; and you know how to use LXML to read the code from those sites!</p><p>Now let's wrap up by learning about the <em>full stack</em>, by which many of these sites are built and run.</p><h1id="python-classes-js-objects"tabindex="-1">Python Classes & JS Objects <aclass="header-anchor"href="#python-classes-js-objects"aria-label="Permalink to “Python Classes & JS Objects”"></a></h1><hr><h1id="python-datetime-structure"tabindex="-1">Python DateTime Structure <aclass="header-anchor"href="#python-datetime-structure"aria-label="Permalink to “Python DateTime Structure”"></a></h1><hr><divclass="language-"><buttontitle="Copy Code"class="copy"></button><spanclass="lang"></span><preclass="shiki shiki-themes github-light github-dark"style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;"tabindex="0"dir="ltr"><code><spanclass="line"><span> for i in items[:5]:</span></span>
<spanclass="line"><span> d = {}</span></span>
<spanclass="line"><span> title = i.xpath('.//td[1]/*/a/font/text()')</span></span>
<spanclass="line"><span> ppr(d)</span></span></code></pre></div><h1id="reading-writing-csv-json"tabindex="-1">Reading & Writing CSV/JSON <aclass="header-anchor"href="#reading-writing-csv-json"aria-label="Permalink to “Reading & Writing CSV/JSON”"></a></h1><hr></div></div></main><footerclass="VPDocFooter"data-v-7011f0d8data-v-e257564d><!--[--><!--]--><!----><navclass="prev-next"aria-labelledby="doc-footer-aria-label"data-v-e257564d><spanclass="visually-hidden"id="doc-footer-aria-label"data-v-e257564d>Pager</span><divclass="pager"data-v-e257564d><aclass="VPLink link pager-link prev"href="/rebel_coding/step2.html"data-v-e257564d><!--[--><spanclass="desc"data-v-e257564d>Previous page</span><spanclass="title"data-v-e257564d>Step 2: JavaScript</span><!--]--></a></div><divclass="pager"data-v-e257564d><aclass="VPLink link pager-link next"href="/rebel_coding/step4.html"data-v-e257564d><!--[--><spanclass="desc"data-v-e257564d>Next page</span><spanclass="title"data-v-e257564d>Step 4: The Full Stack</span><!--]--></a></div></nav></footer><!--[--><!--]--></div></div></div><!--[--><!--]--></div></div><!----><!--[--><!--]--></div></div>