Last Updated: January 25, 2021
·
36
· kalinin84

Natural language processing

import re
import pandas as pd
import texthero as hero
from bs4 import BeautifulSoup
# Sample HTML document used as the input corpus: ten <p> elements, each
# holding "<label> // <sentence>", where the label is "alpha" or "beta".
html = '''
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Hello, world!</title>
</head>
<body>
<p>alpha // Good health is above.</p>
<p>alpha // Wealth is nothing without health.</p>
<p>alpha // Happiness is nothing more than good health and a bad memory.</p>
<p>alpha // Health and cheerfulness mutually beget each other.</p>
<p>alpha // Health is not valued till sickness comes.</p>
<p>beta // Times change.</p>
<p>beta // Time is the great healer.</p>
<p>beta // Time cures all things.</p>
<p>beta // All in good time.</p>
<p>beta // Time is money.</p>
</body>
</html>
'''
# Parse the HTML held in `html` and collect one record per <p> element.
soup = BeautifulSoup(html, 'html.parser')

# Raw string so that `\w` reaches the regex engine instead of being treated
# as an (invalid) string escape; compiled once, outside the loop.
# Group 1 is the label before " // ", group 2 is the sentence after it.
_LINE_PATTERN = re.compile(r'(\w+) // (.+)')

raw = []
for p in soup.find_all('p'):
    match = _LINE_PATTERN.search(p.get_text())
    if match is None:
        # Paragraph is not in "<label> // <text>" form: skip it rather than
        # failing with AttributeError on `None.group`.
        continue
    raw.append({'target': match.group(1), 'text': match.group(2)})

# Pin the column names so the frame has 'target' and 'text' columns even
# when no paragraph matched.
dataset = pd.DataFrame(raw, columns=['target', 'text'])
# Replace the string labels with integer codes; the 'target' column is
# later passed as the `color` argument of the scatter plot.
_label_codes = {'alpha': 1, 'beta': 0}
dataset['target'] = dataset['target'].map(_label_codes)

# Feed the text column through hero.clean, then hero.tfidf
# (max_features=2000), then hero.pca, and store the result as 'pca'.
_cleaned = hero.clean(dataset['text'])
_weighted = hero.tfidf(_cleaned, max_features=2000)
dataset['pca'] = hero.pca(_weighted)

# Plot the 'pca' column, coloured by the encoded label.
hero.scatterplot(dataset, title='Example', col='pca', color='target')