Files
Library/docs/.vitepress/dist/rebel_coding/step3.html

41 lines
45 KiB
HTML
Raw Normal View History

2026-01-09 23:05:52 -05:00
<!DOCTYPE html>
<html lang="en-US" dir="ltr">
<head>
<meta charset="utf-8">
<!-- Page metadata: responsive viewport, document title, description, and the generator tag. -->
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Python | DreamFreely Library</title>
<meta name="description" content="Community Conscious Creations">
<meta name="generator" content="VitePress v2.0.0-alpha.15">
<!-- Stylesheets: each link carries both the "preload" and "stylesheet" relations. -->
<link rel="preload stylesheet" href="/assets/style.tE5om2fv.css" as="style">
<link rel="preload stylesheet" href="/vp-icons.css" as="style">
<!-- Application entry point, loaded as an ES module. -->
<script type="module" src="/assets/app.CNf1Hw3L.js"></script>
<!-- Preload hints: the Inter web font, then the theme, framework, and rebel_coding_step3 module chunks. -->
<link rel="preload" href="/assets/inter-roman-latin.Di8DUHzh.woff2" as="font" type="font/woff2" crossorigin="">
<link rel="modulepreload" href="/assets/chunks/theme.DVARhRSD.js">
<link rel="modulepreload" href="/assets/chunks/framework.BfS8l2sS.js">
<link rel="modulepreload" href="/assets/rebel_coding_step3.md.DPGBP964.lean.js">
<!-- Inline script: adds the "dark" class to the root element when the stored "vitepress-theme-appearance" value is "dark", or when it is unset or "auto" and the prefers-color-scheme: dark media query matches. -->
<script id="check-dark-mode">(()=>{const e=localStorage.getItem("vitepress-theme-appearance")||"auto",a=window.matchMedia("(prefers-color-scheme: dark)").matches;(!e||e==="auto"?a:e==="dark")&&document.documentElement.classList.add("dark")})();</script>
<!-- Inline script: toggles the "mac" class on the root element depending on whether navigator.platform matches Mac, iPhone, iPod, or iPad. -->
<script id="check-mac-os">document.documentElement.classList.toggle("mac",/Mac|iPhone|iPod|iPad/i.test(navigator.platform));</script>
</head>
<body>
<div id="app"><div class="Layout" data-v-1df9f90f><!--[--><!--]--><!--[--><span tabindex="-1" data-v-331ec75c></span><a href="#VPContent" class="VPSkipLink visually-hidden" data-v-331ec75c>Skip to content</a><!--]--><!----><header class="VPNav" data-v-1df9f90f data-v-da52a441><div class="VPNavBar" data-v-da52a441 data-v-70946a35><div class="wrapper" data-v-70946a35><div class="container" data-v-70946a35><div class="title" data-v-70946a35><div class="VPNavBarTitle has-sidebar" data-v-70946a35 data-v-1e38c6bc><a class="title" href="/" data-v-1e38c6bc><!--[--><!--]--><!----><span data-v-1e38c6bc>DreamFreely Library</span><!--[--><!--]--></a></div></div><div class="content" data-v-70946a35><div class="content-body" data-v-70946a35><!--[--><!--]--><div class="VPNavBarSearch search" data-v-70946a35><!----></div><nav aria-labelledby="main-nav-aria-label" class="VPNavBarMenu menu" data-v-70946a35 data-v-39714824><span id="main-nav-aria-label" class="visually-hidden" data-v-39714824> Main Navigation </span><!--[--><!--[--><a class="VPLink link VPNavBarMenuLink" href="/welcome/" tabindex="0" data-v-39714824 data-v-52a1d768><!--[--><span data-v-52a1d768>Welcome</span><!--]--></a><!--]--><!--[--><a class="VPLink link VPNavBarMenuLink" href="/welcome/support.html" tabindex="0" data-v-39714824 data-v-52a1d768><!--[--><span data-v-52a1d768>Support</span><!--]--></a><!--]--><!--]--></nav><!----><div class="VPNavBarAppearance appearance" data-v-70946a35 data-v-6c893767><button class="VPSwitch VPSwitchAppearance" type="button" role="switch" title aria-checked="false" data-v-6c893767 data-v-5337faa4 data-v-1d5665e3><span class="check" data-v-1d5665e3><span class="icon" data-v-1d5665e3><!--[--><span class="vpi-sun sun" data-v-5337faa4></span><span class="vpi-moon moon" data-v-5337faa4></span><!--]--></span></span></button></div><div class="VPSocialLinks VPNavBarSocialLinks social-links" data-v-70946a35 data-v-0394ad82 data-v-d07f11e6><!--[--><a class="VPSocialLink no-icon" 
href="https://canin.dreamfreely.org" aria-label="bookalope" target="_blank" rel="me noopener" data-v-d07f11e6 data-v-591a6b30><span class="vpi-social-bookalope"></span></a><a class="VPSocialLink no-icon" href="https://shop.dreamfreely.org" aria-label="tina" target="_blank" rel="me noopener" data-v-d07f11e6 data-v-591a6b30><span class="vpi-social-tina"></span></a><a class="VPSocialLink no-icon" href="https://digisnaxx.com" aria-label="ghostery" target="_blank" rel="me noopener" data-v-d07f11e6 data-v-591a6b30><span class="vpi-social-ghostery"></span></a><a class="VPSocialLink no-icon" href="https://manifestingempathy.com" aria-label="hasura" target="_blank" rel="me noopener" data-v-d07f11e6 data-v-591a6b30><span class="vpi-social-hasura"></span></a><!--]--></div><div class="VPFlyout VPNavBarExtra extra" data-v-70946a35 data-v-bf2fac68 data-v-42cb505d><button type="button" class="button" aria-haspopup="true" aria-expanded="false" aria-label="extra navigation" data-v-42cb505d><span class="vpi-more-horizontal icon" data-v-42cb505d></span></button><div class="menu" data-v-42cb505d><div class="VPMenu" data-v-42cb505d data-v-25a6cce8><!----><!--[--><!--[--><!----><div class="group" data-v-bf2fac68><div class="item appearance" data-v-bf2fac68><p class="label" data-v-bf2fac68>Appearance</p><div class="appearance-action" data-v-bf2fac68><button class="VPSwitch VPSwitchAppearance" type="button" role="switch" title aria-checked="false" data-v-bf2fac68 data-v-5337faa4 data-v-1d5665e3><span class="check" data-v-1d5665e3><span class="icon" data-v-1d5665e3><!--[--><span class="vpi-sun sun" data-v-5337faa4></span><span class="vpi-moon moon" data-v-5337faa4></span><!--]--></span></span></button></div></div></div><div class="group" data-v-bf2fac68><div class="item social-links" data-v-bf2fac68><div class="VPSocialLinks social-links-list" data-v-bf2fac68 data-v-d07f11e6><!--[--><a class="VPSocialLink no-icon" href="https://canin.dreamfreely.org" aria-label="bookalope" target="_blank" 
rel="me noopener" data-v-d07f11e6 data-v-591a6b30><span class="vpi-social-bookalope"></span></a><a class
<span class="line"><span> pip install lxml</span></span></code></pre></div><p>Error: If you get an error when trying to install LXML, that is totally natural and reasonable. Sometimes, some may say, that&#39;s the benefit of using a tool such as Beautiful Soup, it manages many dependencies, so that new users don&#39;t have to.</p><p>Though in truth, the effort required to supply LXML&#39;s dependencies is relatively minimal.</p><p>The package depends on a series of <code>c</code> files; for Mac users, admittedly, this may require acquiring and updating XCode to include their Command Line Tools package.</p><p>Windows users may have their own issues, regarding Visual C++ components; notice that LXML is dependent on C-language packages.</p><p>If you run into any issues, this is your chance to check out what solutions others have found using your favorite search engine.</p><p>And if you are still unable to resolve the errors you receive, please reach out to <a href="mailto:canin@dreamfreely.org" target="_blank" rel="noreferrer">canin@dreamfreely.org</a>!</p><h3 id="create-new-python-file" tabindex="-1">Create New Python File <a class="header-anchor" href="#create-new-python-file" aria-label="Permalink to “Create New Python File”"></a></h3><p>Phew! 
We got through that entire process.</p><p>Congratulations!!!</p><p>You&#39;ve done some great work so far; we&#39;re navigating the command-line to build a custom toolset.</p><p>That is no small accomplishment!</p><p>Next up, we start building.</p><p>Open up Notepad, or your favorite text editor, and create a new file; naming it however you like, though with the <code>.py</code> extension at the end.</p><h3 id="import-libraries" tabindex="-1">Import Libraries <a class="header-anchor" href="#import-libraries" aria-label="Permalink to “Import Libraries”"></a></h3><p>Our process for building our scraper file is very similar to the steps we took when building our webpage.</p><p>First we need to gather our necessary tools.</p><p>On the first line of our file we will import our first package by typing the command <code>import requests</code>.</p><p>Yup, it is that easy; so next we will import the tools we need from LXML with the following command:</p><div class="language-"><button title="Copy Code" class="copy"></button><span class="lang"></span><pre class="shiki shiki-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;" tabindex="0" dir="ltr"><code><span class="line"><span> from lxml import html</span></span></code></pre></div><p>Feels almost magically simple, doesn&#39;t it?</p><p>Lastly, let&#39;s grab one more toolset by adding the line</p><div class="language-"><button title="Copy Code" class="copy"></button><span class="lang"></span><pre class="shiki shiki-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;" tabindex="0" dir="ltr"><code><span class="line"><span> from pprint import pprint as ppr</span></span></code></pre></div><p>This is a tool that will allow us to print our data in a more readable format.</p><p>So let&#39;s get to scraping!!</p><h3 id="get-site-requests" tabindex="-1">Get Site (requests) <a 
class="header-anchor" href="#get-site-requests" aria-label="Permalink to “Get Site (requests)”"></a></h3><p>What website do you want to scrape?</p><p>Mind you, some websites load their data using JavaScript (many websites do, in fact.)</p><p>And these websites will require additional tools to scrape.</p><p>Nonetheless, the command to <em>scrape</em> a website is as follows:</p><div class="language-"><button title="Copy Code" class="copy"></button><span class="lang"></span><pre class="shiki shiki-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;" tabindex="0" dir="ltr"><code><span class="line"><span> root = requests.get(&#39;https://www.linux.org&#39;)</span></span></code></pre></div><p>Operations will happen in the backgro
<span class="line"><span> d = {}</span></span>
<span class="line"><span> title = i.xpath(&#39;.//td[1]/*/a/font/text()&#39;)</span></span>
<span class="line"><span> d[&#39;title&#39;] = title[0].strip()</span></span>
<span class="line"><span> d[&#39;link&#39;] = i.xpath(&#39;.//td[1]/*/a/@href&#39;)[0]</span></span>
<span class="line"><span> ppr(d)</span></span></code></pre></div><p>We use a <em>for-loop</em> to run through the first 5 items in our list of items; and the first thing we do is create an empty dictionary in which to store our desired information.</p><p>We do this so that we can more easily access this information later.</p><p>Next, we use XPATH to specify the information we&#39;re after.</p><p>XPATH returns a list of elements by default; and if there are no items, it will return an empty list.</p><p>If there is one item, it will return a list with one item; and so in our next line, we extract that singular item and apply the <code>strip()</code> method to remove any excess empty space on either side of our newly acquired <code>title</code>.</p><p>On the next line we shorten this process a bit, by simply adding the index position <code>[0]</code> to the end of our <code>xpath</code> command.</p><p>Lastly we use the Python tool <em>pretty print</em> to display our newly acquired data.</p><p>In order to run our code, we navigate to our file&#39;s location; hopefully you&#39;ve saved it in our Virtual Environment&#39;s folder for ease of use.</p><p>And with our virtual environment activated we will run the command <code>python myFirstScrape.py</code>.</p><p>Though use whatever name you saved your file as, remembering the <code>.py</code> extension at the end.</p><p>Y&#39;all just wrote your first web scraper!!!</p><p>Pour yourself a delicious glass of your favorite beverage or commence any other suitably celebrative action ~ cause y&#39;all just did that!</p><h1 id="more-about-python" tabindex="-1">More About Python <a class="header-anchor" href="#more-about-python" aria-label="Permalink to “More About Python”"></a></h1><hr><p>We&#39;ve glossed over quite a bit just to get ourselves up and running.</p><p>The example script provided in the Rebel Coding startScraping repository goes a bit deeper; so definitely check that out.</p><p>Though now you know how to use the 
Python <code>requests</code> package to mechanically grab websites; and you know how to use LXML to read the code from those sites!</p><p>Now let&#39;s wrap up by learning about the <em>full stack</em>, by which many of these sites are built and run.</p><h1 id="python-classes-js-objects" tabindex="-1">Python Classes &amp; JS Objects <a class="header-anchor" href="#python-classes-js-objects" aria-label="Permalink to “Python Classes &amp; JS Objects”"></a></h1><hr><h1 id="python-datetime-structure" tabindex="-1">Python DateTime Structure <a class="header-anchor" href="#python-datetime-structure" aria-label="Permalink to “Python DateTime Structure”"></a></h1><hr><div class="language-"><button title="Copy Code" class="copy"></button><span class="lang"></span><pre class="shiki shiki-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;" tabindex="0" dir="ltr"><code><span class="line"><span> for i in items[:5]:</span></span>
<span class="line"><span> d = {}</span></span>
<span class="line"><span> title = i.xpath(&#39;.//td[1]/*/a/font/text()&#39;)</span></span>
<span class="line"><span> d[&#39;title&#39;] = title[0].strip()</span></span>
<span class="line"><span> d[&#39;link&#39;] = i.xpath(&#39;.//td[1]/*/a/@href&#39;)[0]</span></span>
<span class="line"><span> date = i.xpath(&#39;.//td[2]/font/text()&#39;)</span></span>
<span class="line"><span> time = i.xpath(&#39;.//td[4]/font/span/font/text()&#39;)</span></span>
<span class="line"><span> time_complete = &quot; &quot;.join(date + time)</span></span>
<span class="line"><span> format_date = &#39;%m/%d/%Y %I:%M %p&#39;</span></span>
<span class="line"><span> d[&#39;real_date&#39;] = datetime.strptime(time_complete, format_date)</span></span>
<span class="line"><span> ppr(d)</span></span></code></pre></div><h1 id="reading-writing-csv-json" tabindex="-1">Reading &amp; Writing CSV/JSON <a class="header-anchor" href="#reading-writing-csv-json" aria-label="Permalink to “Reading &amp; Writing CSV/JSON”"></a></h1><hr></div></div></main><footer class="VPDocFooter" data-v-7011f0d8 data-v-e257564d><!--[--><!--]--><!----><nav class="prev-next" aria-labelledby="doc-footer-aria-label" data-v-e257564d><span class="visually-hidden" id="doc-footer-aria-label" data-v-e257564d>Pager</span><div class="pager" data-v-e257564d><a class="VPLink link pager-link prev" href="/rebel_coding/step2.html" data-v-e257564d><!--[--><span class="desc" data-v-e257564d>Previous page</span><span class="title" data-v-e257564d>Step 2: JavaScript</span><!--]--></a></div><div class="pager" data-v-e257564d><a class="VPLink link pager-link next" href="/rebel_coding/step4.html" data-v-e257564d><!--[--><span class="desc" data-v-e257564d>Next page</span><span class="title" data-v-e257564d>Step 4: The Full Stack</span><!--]--></a></div></nav></footer><!--[--><!--]--></div></div></div><!--[--><!--]--></div></div><!----><!--[--><!--]--></div></div>
<script>window.__VP_HASH_MAP__=JSON.parse("{\"api-examples.md\":\"CLsEsApn\",\"df_guide_1_basic_psych.md\":\"DwYn8aJz\",\"df_guide_2_medi_vibes.md\":\"BL-BuG9R\",\"df_guide_3_with_loneliness.md\":\"Bo6ULJ8y\",\"df_guide_4_enter_alchemy.md\":\"DcZx6NAy\",\"df_guide_5_test_ethic.md\":\"nyIQpJwM\",\"df_guide_6_social_physics.md\":\"CNA2jQCU\",\"df_guide_7_quantum_realm.md\":\"BFXlzHiT\",\"df_guide_8_topol.md\":\"BjXlxBWE\",\"df_guide_appendices.md\":\"CDMgMihK\",\"df_guide_index.md\":\"xD8OyHvK\",\"exocto_analytics.md\":\"BM3Pg91r\",\"exocto_appendices.md\":\"sVwSvUYe\",\"exocto_automation.md\":\"DEvrDE4y\",\"exocto_cooltools.md\":\"BC6_kCf0\",\"exocto_index.md\":\"DHdM3bQG\",\"exocto_licences.md\":\"Dk7QbXAY\",\"exocto_mailinglists.md\":\"D42Z_f3S\",\"exocto_payments.md\":\"BwzhBIvJ\",\"exocto_seo101.md\":\"CRJt4LE8\",\"exocto_socialmedia.md\":\"B9APVBs4\",\"exocto_websites.md\":\"6J92idVL\",\"index.1.md\":\"D7r5zJY5\",\"index.md\":\"DcciJnS6\",\"markdown-examples.md\":\"B9ZdJivL\",\"mempath_able-ism.md\":\"Dof--8TS\",\"mempath_appendices.md\":\"DBar2q2m\",\"mempath_gender-studies.md\":\"BcTGweUU\",\"mempath_index.md\":\"Dn2o6A3M\",\"mempath_notes.huggingface.md\":\"CGBrM3lA\",\"mempath_onward.md\":\"DxhCj56a\",\"mempath_openai.playground.md\":\"C3ceZL9Q\",\"mempath_outline.huggingface.md\":\"BFuiOEVO\",\"mempath_power-dynamics.md\":\"CQWNxegt\",\"mempath_racism.md\":\"CxVF4Hbn\",\"mempath_sexism.md\":\"Db1tqU4H\",\"rebel_coding_appendices.md\":\"gQ1t-5b7\",\"rebel_coding_index.md\":\"BgfENPxu\",\"rebel_coding_orientation.md\":\"BjaO66hJ\",\"rebel_coding_step1.md\":\"CgXIS7sX\",\"rebel_coding_step2.md\":\"C1zU29Lh\",\"rebel_coding_step3.md\":\"DPGBP964\",\"rebel_coding_step4.md\":\"Bp6KMb-K\",\"rebel_coding_step5.md\":\"Dpyx0NUO\",\"rebel_coding_step6.md\":\"VjnlTvFQ\",\"rebel_coding_step7.md\":\"Cv22hGLG\",\"rebel_coding_step8.md\":\"hfdnDA5D\",\"rebel_coding_termintro.md\":\"yqyXwJUh\",\"rebel_coding_v2.md\":\"C_Ab_PCg\",\"rebel_coding_v2orient.md\":\"Cih4N80W\",\"w
elcome_canin.md\":\"BVaWnSlf\",\"welcome_dreamfreely.md\":\"C8ZaDhcw\",\"welcome_index.md\":\"IohpDKGr\",\"welcome_support.md\":\"CIaYjdmV\"}");window.__VP_SITE_DATA__=JSON.parse("{\"lang\":\"en-US\",\"dir\":\"ltr\",\"title\":\"DreamFreely Library\",\"description\":\"Community Conscious Creations\",\"base\":\"/\",\"head\":[],\"router\":{\"prefetchLinks\":true},\"appearance\":true,\"themeConfig\":{\"nav\":[{\"text\":\"Welcome\",\"link\":\"/welcome/\"},{\"text\":\"Support\",\"link\":\"/welcome/support\"}],\"sidebar\":[{\"text\":\"Welcome\",\"items\":[{\"text\":\"Chibu / Hola / Hello\",\"link\":\"/welcome/\"},{\"text\":\"About DreamFreely\",\"link\":\"/welcome/dreamfreely\"},{\"text\":\"About Canin Carlos\",\"link\":\"/welcome/canin\"}]},{\"text\":\"Rebel Coding 101\",\"items\":[{\"text\":\"Intro to Coding\",\"link\":\"/rebel_coding/\"},{\"text\":\"Orientation\",\"link\":\"/rebel_coding/orientation\"},{\"text\":\"Intro to CLI\",\"link\":\"/rebel_coding/termintro\"},{\"text\":\"Step 1: HTML & CSS\",\"link\":\"/rebel_coding/step1\"},{\"text\":\"Step 2: JavaScript\",\"link\":\"/rebel_coding/step2\"},{\"text\":\"Step 3: Python Scrapers\",\"link\":\"/rebel_coding/step3\"},{\"text\":\"Step 4: The Full Stack\",\"link\":\"/rebel_coding/step4\"},{\"text\":\"Appendices\",\"link\":\"/rebel_coding/appendices\"}]},{\"text\":\"Rebel Coding 102\",\"items\":[{\"text\":\"Reviewing the Full-Stack\",\"link\":\"/rebel_coding/v2\"},{\"text\":\"Planning Our App\",\"link\":\"/rebel_coding/v2Orient\"},{\"text\":\"Step 1: Server\",\"link\":\"/rebel_coding/step5\"},{\"text\":\"Step 2: Client\",\"link\":\"/rebel_coding/step6\"},{\"text\":\"Step 3: Deployment\",\"link\":\"/rebel_coding/step7\"},{\"text\":\"Step 4: Scaling, etc\",\"link\":\"/rebel_coding/step8\"},{\"text\":\"More Appendices\",\"link\":\"/rebel_coding/more-appendices\"}]},{\"text\":\"Manifesting Empathy\",\"items\":[{\"text\":\"Welcome\",\"link\":\"/mempath/\"},{\"text\":\"The Egg (0-5)\",\"link\":\"/mempath/egg\"},{\"text\":\"The 
Hatchling (5-12)\",\"link\":\"/mempath/hatchling\"},{\"text\":\"Flight (12-18)\",\"link\":\
</body>
</html>