Last Updated: January 25, 2021
· kalinin84

Natural language processing

import re
import pandas as pd
import texthero as hero
from bs4 import BeautifulSoup
# Sample HTML document: every <p> holds one "<label> // <sentence>" record.
html = '''
<!doctype html>
<html lang="en">
<meta charset="utf-8">
<title>Hello, world!</title>
<p>alpha // Good health is above.</p>
<p>alpha // Wealth is nothing without health.</p>
<p>alpha // Happiness is nothing more than good health and a bad memory.</p>
<p>alpha // Health and cheerfulness mutually beget each other.</p>
<p>alpha // Health is not valued till sickness comes.</p>
<p>beta // Times change.</p>
<p>beta // Time is the great healer.</p>
<p>beta // Time cures all things.</p>
<p>beta // All in good time.</p>
<p>beta // Time is money.</p>
'''
soup = BeautifulSoup(html, 'html.parser')

# Group 1 captures the label before " // ", group 2 the sentence after it.
# Compiled once so the loop below does not repeat the pattern lookup.
record_pattern = re.compile(r'(\w+) // (.+)')

raw = []
for p in soup.find_all('p'):
    match = record_pattern.search(p.get_text())
    if match is None:
        # Skip paragraphs that do not follow the "<label> // <sentence>"
        # layout instead of failing on a None match.
        continue
    raw.append({'target': match.group(1), 'text': match.group(2)})
dataset = pd.DataFrame(raw)

# Replace the textual labels with numeric classes (alpha -> 1, beta -> 0).
dataset['target'] = dataset['target'].map({'alpha': 1, 'beta': 0})

# NOTE(review): the input Series and the clean/pca steps were missing from
# the snippet and are restored here following the standard texthero
# clean -> tfidf -> pca pipeline; confirm against the original article.
dataset['pca'] = (
    dataset['text']
    .pipe(hero.clean)
    .pipe(hero.tfidf, max_features=2000)
    .pipe(hero.pca)
)

# Plot the 'pca' column, colouring each point by its 'target' class.
hero.scatterplot(dataset, col='pca', color='target', title='Example')