
# Imports for the llama_index (v0.6-era) API used by the class below
from llama_index import (
    SimpleDirectoryReader,
    GPTVectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)

class llama_context():
    def __init__(self, path=None):
        # code
    def load_data(self):
        self.documents = SimpleDirectoryReader(self.data_dir).load_data()
        print(f"Documents loaded: {len(self.documents)}.")
    def create_vector_store(self):
        self.index = GPTVectorStoreIndex.from_documents(self.documents)
        print("GPTVectorStoreIndex complete.")
    def save_index(self):
        self.index.storage_context.persist(persist_dir=self.persist_dir)
        print(f"Index saved in path {self.persist_dir}.")
    def load_index(self):
        storage_context = StorageContext.from_defaults(persist_dir=self.persist_dir)
        self.index = load_index_from_storage(storage_context)
    def start_query_engine(self):
        self.query_engine = self.index.as_query_engine()
        print("Query_engine started.")
    def post_question(self, question, sleep=None):
        # code
    def del_data_dir(self):
        # code
    def copy_file_to_data_dir(self, file_extension='.txt', verbose=0):
        # code
    def copy_path_from_to_data_dir(self, path_from, file_extension='.txt', verbose=0):
        # code
    def estimate_tokens(self, text):
        # code
    def estimate_cost(self):
        # code
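The remaining method bodies are elided in the post (# code). A minimal sketch of post_question() and copy_path_from_to_data_dir(), assuming the answer is kept on the object and source files are simply copied with shutil; the sleep handling and copy logic here are assumptions, not the author's exact code:

import os
import shutil
import time

def post_question(self, question, sleep=None):
    # Assumed implementation: query the engine and keep the answer on the object
    if sleep:
        time.sleep(sleep)  # optional pause to respect API rate limits
    self.response = self.query_engine.query(question)
    return self.response

def copy_path_from_to_data_dir(self, path_from, file_extension='.txt', verbose=0):
    # Assumed implementation: copy all files with the extension into the data directory
    os.makedirs(self.data_dir, exist_ok=True)
    for root, _, files in os.walk(path_from):
        for name in files:
            if name.endswith(file_extension):
                shutil.copy(os.path.join(root, name), self.data_dir)
                if verbose:
                    print(f"Copied {os.path.join(root, name)} -> {self.data_dir}")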
# Create the lct object from class llama_context() with the working path
path_llama = "llama_mvp"
lct = llama_context(path=path_llama)
# Delete data directory
lct.del_data_dir()
# Copy files from source to data directory
path_from = "llama_mvp/source"
lct.copy_path_from_to_data_dir(path_from) # default extension *.txt
# Load documents
# Content "Bogdan was born in 1990"
lct.load_data()
# Creating the vector store does embedding and costs tokens
lct.create_vector_store()
# Out:
# INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
# > [build_index_from_nodes] Total LLM token usage: 0 tokens
# INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 7 tokens
# > [build_index_from_nodes] Total embedding token usage: 7 tokens
# GPTVectorStoreIndex complete.
# Save index
lct.save_index()
# Method load_index() costs tokens, like create_vector_store(),
# but you don't have to reload the data every time.
# The index holds the content knowledge.
lct.load_index()
# Start query engine
lct.start_query_engine()
question = "What is content about?"
lct.post_question(question)
print(lct.response)
# Out:
# The content is about Bogdan and the year he was born.
question = "How old is he?"
# Out:
# Bogdan is 30 years old.
question = "What date is today?"
# Out:
# Today's date is August 8, 2020.
from datetime import date
today = date.today()
question = f"Consider current date {today}"
# Out:
# Consider current date 2023-05-15
# Bogdan is 33 years old.
question = "Where is the name commonly used as a given name?"
# Out:
# The name Bogdan is commonly used as a given name in Eastern European countries such as Romania, Bulgaria, and Ukraine.
Collect URLs.
import urllib.request
import os
class collect_html():
    def __init__(self):
        pass
    def read_save_html(self, url, path_save=None, filename=None, mode=0):
        # mode: 0 - save, 1 - content, 2 - save and content
        response = urllib.request.urlopen(url)
        html_file = response.read()
        # code
        if mode == 1 or mode == 2:
            return html_file
# code
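The save logic is elided above; a possible shape for the full method, assuming the file name falls back to the last URL segment when none is given:

import os
import urllib.request

def read_save_html(self, url, path_save=None, filename=None, mode=0):
    # mode: 0 - save, 1 - content, 2 - save and content
    response = urllib.request.urlopen(url)
    html_file = response.read()
    if mode in (0, 2):
        # Assumed save logic: derive the file name from the URL if not given
        filename = filename or url.rstrip("/").split("/")[-1] + ".html"
        os.makedirs(path_save, exist_ok=True)
        with open(os.path.join(path_save, filename), "wb") as f:
            f.write(html_file)
    if mode == 1 or mode == 2:
        return html_file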
# List html files
repo_path = path_save
list_ipynb(repo_path, "html")
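list_ipynb() is a small helper that is not shown in the post; a minimal sketch, assuming it just walks the folder and prints the files matching the given extension:

import os

def list_ipynb(repo_path, extension):
    # Assumed helper: print every file under repo_path ending with the extension
    for root, _, files in os.walk(repo_path):
        for name in files:
            if name.endswith("." + extension):
                print(os.path.join(root, name))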
# The URL content is stored in the files:
# llama_challenge\html_challenge\understanding_metrics_blog.html
# llama_challenge\html_challenge\challenge_20221107.html
# llama_challenge\html_challenge\challenge_20221128.html
# llama_challenge\html_challenge\challenge_20221222.html
# llama_challenge\html_challenge\hana_ml.dataframe.html
# llama_challenge\html_challenge\hana_ml.algorithms.pal.trees.HybridGradientBoostingClassifier.html
# https://github.com/SAP-samples/hana-ml-samples
# https://github.com/SAP-samples/hana-ml-samples/tree/main/Python-API/usecase-examples/sapcommunity-ha...
# folder = "Python-API\usecase-examples\sapcommunity-hanaml-challenge"
REPO_URL = "https://github.com/itsergiu/sapcommunity-hanaml-challenge"
DOCS_FOLDER = "llama_challenge/ipynb_blog"
!git clone $REPO_URL $DOCS_FOLDER
REPO_URL = "https://github.com/SAP-samples/hana-ml-samples"
DOCS_FOLDER = "llama_challenge/ipynb_hana_ml_samples"
!git clone $REPO_URL $DOCS_FOLDER
repo_path = "ipynb_blog"
list_ipynb(repo_path, "ipynb")
# ipynb_blog\SAP HANA ML challendge - CHURN v2.3 max.ipynb
repo_path = "ipynb_hana_ml_samples/Python-API/usecase-examples/sapcommunity-hanaml-challenge"
list_ipynb(repo_path, "ipynb")
# ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\10 Connectivity Check.ipynb
# ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\20 Data upload.ipynb
# ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\PAL Tutorial - Unified Classification Hybrid Gradient Boosting - PredictiveQuality Example.ipynb
# ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\Upload and explore Employee Churn data.ipynb
repo_path = "ipynb_blog"
list_ipynb(repo_path, "ipynb")
# ipynb_blog\SAP HANA ML challendge - CHURN v2.3 max.ipynb
class collect_ipynb():
    def __init__(self):
        pass
    def ipynb_to_html(self, ipynb_file, path_save=None, encoding=None, content=False, verbose=0):
        # verbose: 0 - Completion, 1 - Source & Destination
        # code
    def ipynb_path_to_html(self, repo_path=None, path_save=None, encoding=None, verbose=0):
        # verbose: 0 - Complete message | 1 - Source file & Saved file
        # code
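The conversion body is elided; a minimal sketch of ipynb_to_html() based on nbconvert's HTMLExporter, where the output naming and the handling of path_save, content and verbose are assumptions:

import os
import nbformat
from nbconvert import HTMLExporter

def ipynb_to_html(self, ipynb_file, path_save=None, encoding=None, content=False, verbose=0):
    # Read the notebook and render it to HTML
    nb = nbformat.read(ipynb_file, as_version=4)
    body, _ = HTMLExporter().from_notebook_node(nb)
    # Write the HTML next to the notebook unless another folder is given
    html_name = os.path.splitext(os.path.basename(ipynb_file))[0] + ".html"
    out_dir = path_save or os.path.dirname(ipynb_file)
    out_file = os.path.join(out_dir, html_name)
    with open(out_file, "w", encoding=encoding or "utf-8") as f:
        f.write(body)
    if verbose == 1:
        print(f"{ipynb_file} -> {out_file}")
    if content:
        return body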
# Converted notebooks are stored in files:
# Out:
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\10 Connectivity Check.html
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\20 Data upload.html
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\PAL Tutorial - Unified Classification Hybrid Gradient Boosting - PredictiveQuality Example.html
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\Upload and explore Employee Churn data.html
# Out:
# llama_challenge\ipynb_blog\SAP HANA ML challendge - CHURN v2.3 max.html
class collect_text():
    def __init__(self, mask_ext=None):
        # code
    def open_html(self, html_file, encoding_read=None):
        # code
    def html_to_text(self, html_content):
        # code
    def html_to_text_file(self, html_file, path_save=None, content=False, verbose=0,
                          encoding_read=None, encoding_write=None):
        # code
    def html_path_to_text(self, repo_path=None, path_save=None, encoding_read=None, encoding_write=None, verbose=0):
        # code
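The HTML-to-text body is elided; a minimal sketch of html_to_text() using BeautifulSoup, with the separator and stripping choices as assumptions:

from bs4 import BeautifulSoup

def html_to_text(self, html_content):
    # Strip tags and collapse the document to plain text, one block per line
    soup = BeautifulSoup(html_content, "html.parser")
    return soup.get_text(separator="\n", strip=True)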
# Converted files from HTML into TXT are stored in the same location:
# Out:
# llama_challenge\html_challenge\understanding_metrics_blog.txt
# llama_challenge\html_challenge\challenge_20221107.txt
# llama_challenge\html_challenge\challenge_20221128.txt
# llama_challenge\html_challenge\challenge_20221222.txt
# llama_challenge\html_challenge\hana_ml.dataframe.txt
# llama_challenge\html_challenge\hana_ml.algorithms.pal.trees.HybridGradientBoostingClassifier.txt
# Out:
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\readme.txt
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\10 Connectivity Check.txt
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\20 Data upload.txt
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\PAL Tutorial - Unified Classification Hybrid Gradient Boosting - PredictiveQuality Example.txt
# llama_challenge\ipynb_hana_ml_samples\Python-API\usecase-examples\sapcommunity-hanaml-challenge\Upload and explore Employee Churn data.txt
# Out:
# llama_challenge\ipynb_blog\SAP HANA ML challendge - CHURN v2.3 max.txt
# lct = llama_context(path='llama')
path_llama = "llama_challenge"
lct = llama_context(path=path_llama)
display(lct.path)
display(lct.data_dir)
display(lct.persist_dir)
# Out:
# 'llama_challenge'
# 'llama_challenge\\data'
# 'llama_challenge\\storage'
path_from1 = "llama_challenge/html_challenge"
path_from2 = "llama_challenge/ipynb_blog"
path_from3 = "llama_challenge/ipynb_hana_ml_samples/Python-API/usecase-examples/sapcommunity-hanaml-challenge"
lct.copy_path_from_to_data_dir(path_from1) # default extension *.txt
lct.copy_path_from_to_data_dir(path_from2) # default extension *.txt
lct.copy_path_from_to_data_dir(path_from3) # default extension *.txt
# The files converted to TXT are saved in the folders:
# html_challenge
# ipynb_blog
# ipynb_hana_ml_samples
lct.load_data()
# Out:
# Documents loaded: 12.
lct.estimate_cost()
# Out:
# Total estimated costs with model ada: $0.0175276
# Total estimated costs with model davinci: $1.31457
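estimate_tokens() and estimate_cost() are also elided; a minimal sketch, assuming a rough 4-characters-per-token heuristic and illustrative per-1K-token prices (check the current OpenAI pricing page for real values):

def estimate_tokens(self, text):
    # Rough heuristic: about 4 characters per token for English text
    return len(text) // 4

def estimate_cost(self):
    # Sum estimated tokens over the loaded documents and price them per model
    tokens = sum(self.estimate_tokens(doc.text) for doc in self.documents)
    prices_per_1k = {"ada": 0.0004, "davinci": 0.03}  # assumed USD per 1K tokens
    for model, price in prices_per_1k.items():
        print(f"Total estimated costs with model {model}: ${tokens / 1000 * price}")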
lct.create_vector_store()
# API key is required. Embedding costs tokens!
# https://platform.openai.com/account/api-keys
# Out:
# Total embedding token usage: 147741 tokens
# GPTVectorStoreIndex complete.
# https://platform.openai.com/account/usage
# Usage - $0.35
# text-embedding-ada-002-v2, 24 requests
# 103,950 prompt + 0 completion = 103,950 tokens
lct.save_index()
# Out:
# Index saved in path llama_challenge\storage.
lct.load_index()
# API key is required. Loading and embedding cost tokens!
# https://platform.openai.com/account/api-keys
# Out:
# Loading all indices.
lct.start_query_engine()
question = "What is content about?"
lct.post_question(question)
print(lct.response)
# Out:
The content is about SAP HANA and its related technologies, such as SAP HANA Cloud's Auto ML
capabilities, SAP HANA Python Client API for Machine Learning Algorithms, and SAP HANA Predictive
Analysis Library (PAL). It also includes information about a book related to SAP HANA and a blog post
about SAP HANA Machine Learning with ABAP Managed Database Procedures in SAP BW/4HANA.
question = "Who organized the Community Call?"
# Out:
The SAP HANA Cloud Machine Learning Challenge team organized the Community Call.
question = "What problem participants must solve?"
# Out:
Participants must solve the problem of predicting employee churn.
question = "Explain data for predicting employee churn"
# Out:
Data for predicting employee churn can include information about the employee such as their job title, years of experience, salary, performance reviews, and other factors that may influence their decision to stay or leave the company. Additionally, data can be collected from the company itself, such as the onboarding process, company culture, learning opportunities, and other factors that may influence employee churn. By analyzing this data, patterns can be identified that can help predict employee churn and inform decisions about how to improve the company's retention rate.
question = "Can you tell me which machine learning models were used by the participants?"
# Out:
The participants used the HybridGradientBoostingTree model for their machine learning.
question = "Which are the top 5 important features discoverd by the model?"
# Out:
The top 5 important features discovered by the model are: SICKDAYS, HRTRAINING, PREVIOUS_CAREER_PATH, LINKEDIN, and FUNCTIONALAREACHANGETYPE.
# QUESTION 1
question = "Python full code SAP HANA Machine learning HGBT example"
# Out:
The following code is an example of using the SAP HANA Python Client API for Machine Learning Algorithms to implement a HGBT (Hierarchical Gradient Boosting Tree) model.
# Import the necessary libraries
import hana_ml
from hana_ml.algorithms.apl.hgbt import HGBT
# Create a connection to the SAP HANA system
connection_context = hana_ml.dataframe.ConnectionContext(address='<hostname>:<port>',
user='<username>',
password='<password>')
# Load the data into a dataframe
df = connection_context.table('<schema>.<table>')
# Create the HGBT model
hgbt = HGBT(conn_context=connection_context)
# Fit the model
hgbt.fit(data=df, key='<key_column>', label='<label_column>')
# Make predictions
predictions = hgbt.predict(data=df)
# Evaluate the model
hgbt.evaluate(data=df, label='
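The generated answer is cut off, and the import path it invents does not match the real hana_ml package layout. For comparison, a minimal sketch with hana_ml's PAL HybridGradientBoostingClassifier could look roughly like this; connection details, table and column names are placeholders:

import hana_ml
from hana_ml.algorithms.pal.trees import HybridGradientBoostingClassifier

# Connect to the SAP HANA system (placeholder credentials)
connection_context = hana_ml.dataframe.ConnectionContext(
    address="<hostname>", port=443, user="<username>", password="<password>")

# Reference a table as a HANA dataframe (no data is pulled to the client)
df = connection_context.table("<table>", schema="<schema>")

# Train a hybrid gradient boosting classifier and predict on the same table
hgbt = HybridGradientBoostingClassifier(n_estimators=100)
hgbt.fit(data=df, key="<key_column>", label="<label_column>")
predictions = hgbt.predict(data=df, key="<key_column>")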
# QUESTION 2
question = "Python full code hana_ml dataframe example"
# Out:
In [1]:
# Import the necessary libraries
import hana_ml
import pandas as pd
# Load the CSV file into a Python object (Pandas DataFrame)
df_data = pd.read_csv(r'Emp_Churn_Train.csv', sep = ',')
# Create a connection to the HANA system
connection_context = hana_ml.dataframe.ConnectionContext(address='<HANA_SYSTEM_ADDRESS>', port=<HANA_SYSTEM_PORT>, user='<HANA_SYSTEM_USER>', password='<HANA_SYSTEM_PASSWORD>')
# Create a dataframe object from the Pandas DataFrame
df_remote = connection_context.table('EMP_CHURN_TRAIN', schema='<HANA_SYSTEM_SCHEMA>', data=df_data)
# Create training and testing set
from hana_ml.algorithms.pal import partition
hdf_train, hdf_test, hdf_val = partition.train_test_val_split( random_seed = 1017
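This answer is truncated as well, and connection_context.table() does not appear to take a data argument; in hana_ml a pandas DataFrame is normally uploaded with create_dataframe_from_pandas(). A rough sketch of the upload and split, with the split percentages as assumptions:

from hana_ml.dataframe import create_dataframe_from_pandas
from hana_ml.algorithms.pal import partition

# Upload the pandas DataFrame to a HANA table and get a HANA dataframe back
df_remote = create_dataframe_from_pandas(
    connection_context, df_data, "EMP_CHURN_TRAIN", force=True)

# Split into train / test / validation sets
hdf_train, hdf_test, hdf_val = partition.train_test_val_split(
    data=df_remote, random_seed=1017,
    training_percentage=0.7, testing_percentage=0.15, validation_percentage=0.15)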
Execute the notebook in your browser with Binder.
Execute the notebook 05 SAP HANA Machine Learning content embedding v1.3.1.ipynb. It uses the data already embedded in vector_store, so tokens are consumed only for questions.
The notebook 05 SAP HANA Machine Learning content embedding v1.3.ipynb is the full version, which collects the documents from the previous steps and creates the vector_store. Creating the vector_store costs tokens.