mrncstt mrncstt

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, ArrayType


class ProfilePipeline:
    """Build and render two small Spark DataFrames: a profile card and a blog index.

    Typical use is ``build_profile()``, ``build_blog()``, ``display()`` and
    finally ``stop()``. ``display()`` builds any frame that has not been built
    yet, so calling it first is also safe.
    """

    # Filled in by build_profile() / build_blog(). Declared here so the
    # attributes always exist and display() can detect "not built yet".
    df_profile = None
    df_blog = None

    def __init__(self, app_name="mrncstt-profile"):
        """Create (or reuse) a Spark session registered under *app_name*."""
        self.spark = SparkSession.builder.appName(app_name).getOrCreate()

    def build_profile(self):
        """Populate ``self.df_profile`` with a single-row profile DataFrame.

        The ``interests`` and ``stack`` array columns are flattened into
        delimited strings, and an ``updated_at`` column holding the current
        date is appended.
        """
        schema = StructType([
            StructField("name", StringType()),
            StructField("role", StringType()),
            StructField("education", StringType()),
            StructField("interests", ArrayType(StringType())),
            StructField("stack", ArrayType(StringType())),
            StructField("digital_garden", StringType()),
            StructField("contact", StringType()),
        ])

        data = [(
            "Mariana Costa",
            "Data Engineer",
            "B.E. Production Engineering - UFRN (Brazil)",
            ["music", "HQ", "data literacy", "french"],
            ["PySpark", "Databricks", "SQL", "Python", "Power BI", "Tableau", "Qlik"],
            "mrncstt.github.io",
            "linkedin.com/in/mrncstt",
        )]

        self.df_profile = (
            self.spark.createDataFrame(data, schema=schema)
            .withColumn("interests", F.array_join("interests", ", "))
            .withColumn("stack", F.array_join("stack", " | "))
            .withColumn("updated_at", F.current_date())
        )

    def build_blog(self):
        """Populate ``self.df_blog`` with one row per blog post plus a ``category``.

        The category is derived from keywords in the lower-cased title; the
        first matching rule wins and unmatched titles fall back to ``"misc"``.
        """
        schema = StructType([
            StructField("title", StringType()),
            StructField("url", StringType()),
        ])

        data = [
            ("Export your LinkedIn saved posts with Selenium and Beautiful Soup",
             "https://mrncstt.github.io/posts/export_linkedin_saved_posts_selenium_bs4/"),
            ("Seeking insights from a recording using Google Cloud Speech-to-Text, Google Colab and ChatGPT",
             "https://mrncstt.github.io/posts/seeking_insights_from_a_recording_using_google_cloud_speech_to_text_google_colab_and_chatgpt/"),
            ("Enhance your BI skills: people to follow for Tableau, Qlik, and Power BI",
             "https://mrncstt.github.io/posts/bi_people_follow/"),
            ("Buenos Aires for 33 days: a practical guide",
             "https://mrncstt.github.io/posts/tips_buenos_aires/"),
            ("Make Over Monday 2022 W/35",
             "https://mrncstt.github.io/posts/make_over_monday_2022_w_35/"),
            ("How do I keep myself updated about data, product management and productivity",
             "https://mrncstt.github.io/posts/stay-updated-data-product-management-productivity/"),
            ("Podcasts to learn French",
             "https://mrncstt.github.io/posts/podcasts_French/"),
            ("What I learned as a hackathon mentor",
             "https://mrncstt.github.io/posts/What_I_learned_from_being_a_mentor/"),
            ("Turning memorial comments into a Qlik word cloud",
             "https://mrncstt.github.io/posts/Word_Cloud_with_Qlik/"),
            ("What I learned from teaching my first course",
             "https://mrncstt.github.io/posts/What_I_Learned_from_Teaching/"),
        ]

        # Lower-case once and reuse the same column expression in every rule.
        title = F.lower(F.col("title"))

        # "bi" must match as a whole word: a plain substring test would also
        # fire on words such as "mobile" or "combine". The whole-word match
        # already covers "power bi", so no separate check is needed for it.
        is_data_viz = (
            title.rlike(r"\bbi\b")
            | title.contains("tableau")
            | title.contains("qlik")
        )
        is_tech = title.contains("selenium") | title.contains("speech-to-text")
        is_career = title.contains("mentor") | title.contains("teaching")

        category = (
            F.when(title.contains("french"), "languages")
            .when(is_data_viz, "data viz")
            .when(is_tech, "tech")
            .when(is_career, "career")
            .otherwise("misc")
        )

        self.df_blog = (
            self.spark.createDataFrame(data, schema=schema)
            .withColumn("category", category)
        )

    def display(self):
        """Render the profile frame, then the blog frame.

        Any frame that has not been built yet is built first, so this method
        no longer fails when called before ``build_profile()``/``build_blog()``.
        """
        if self.df_profile is None:
            self.build_profile()
        if self.df_blog is None:
            self.build_blog()

        for frame in (self.df_profile, self.df_blog):
            self._render(frame)

    @staticmethod
    def _render(frame):
        """Show *frame* with the notebook ``display`` helper, else ``frame.show``."""
        try:
            # ``display`` is a global injected by notebook environments; it is
            # not defined when this code runs as a plain Python script.
            notebook_display = display
        except NameError:
            frame.show(truncate=False)
        else:
            notebook_display(frame)

    def stop(self):
        """Stop the underlying Spark session."""
        self.spark.stop()


# Build both frames and render them. The Spark session is released in a
# ``finally`` so it is stopped even when a build or render step raises.
pipeline = ProfilePipeline()
try:
    pipeline.build_profile()
    pipeline.build_blog()
    pipeline.display()
finally:
    pipeline.stop()

`display(pipeline.df_profile)`

name	role	education	interests	stack	digital_garden	contact	updated_at
Mariana Costa	Data Engineer	B.E. Production Engineering - UFRN (Brazil)	music, HQ, data literacy, french	PySpark | Databricks | SQL | Python | Power BI | Tableau | Qlik	mrncstt.github.io	linkedin.com/in/mrncstt	2026-02-06

`display(pipeline.df_blog)`

title	url	category
Export your LinkedIn saved posts with Selenium and Beautiful Soup	link	tech
Seeking insights from a recording using Google Cloud Speech-to-Text, Google Colab and ChatGPT	link	tech
Enhance your BI skills: people to follow for Tableau, Qlik, and Power BI	link	data viz
Buenos Aires for 33 days: a practical guide	link	misc
Make Over Monday 2022 W/35	link	misc
How do I keep myself updated about data, product management and productivity	link	misc
Podcasts to learn French	link	languages
What I learned as a hackathon mentor	link	career
Turning memorial comments into a Qlik word cloud	link	data viz
What I learned from teaching my first course	link	career

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

mrncstt mrncstt

Block or report mrncstt

`display(pipeline.df_profile)`

`display(pipeline.df_blog)`

Pinned Loading

Uh oh!