diff --git a/Usage.md b/Usage.md index 9cf774e7..4371ca6b 100644 --- a/Usage.md +++ b/Usage.md @@ -149,6 +149,7 @@ TBD #### Changing a model See [here](pr_agent/algo/__init__.py) for the list of available models. +#### Azure To use Azure, set: ``` api_key = "" # your azure api key @@ -166,6 +167,30 @@ model="" # the OpenAI model you've deployed on Azure (e.g. gpt-3.5-turbo) ``` in the configuration.toml +#### Huggingface + +To use a new model with Huggingface Inference Endpoints, for example, set: +``` +[__init__.py] +MAX_TOKENS = { + "model-name-on-huggingface": <max_tokens> +} +e.g. +MAX_TOKENS={ + ..., + "meta-llama/Llama-2-7b-chat-hf": 4096 +} +[config] # in configuration.toml +model = "huggingface/meta-llama/Llama-2-7b-chat-hf" + +[huggingface] # in .secrets.toml +key = ... # your huggingface api key +api_base = ... # the base url for your huggingface inference endpoint +``` +(you can obtain a Huggingface API key from [here](https://huggingface.co/settings/tokens)) + +#### Replicate + To use Llama2 model with Replicate, for example, set: ``` [config] # in configuration.toml @@ -175,6 +200,7 @@ key = ... ``` (you can obtain a Llama2 key from [here](https://replicate.com/replicate/llama-2-70b-chat/api)) + Also review the [AiHandler](pr_agent/algo/ai_handler.py) file for instruction how to set keys for other models. 
#### Extra instructions diff --git a/pr_agent/algo/__init__.py b/pr_agent/algo/__init__.py index 798fc6c5..f7865250 100644 --- a/pr_agent/algo/__init__.py +++ b/pr_agent/algo/__init__.py @@ -11,4 +11,5 @@ MAX_TOKENS = { 'claude-2': 100000, 'command-nightly': 4096, 'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1': 4096, + 'meta-llama/Llama-2-7b-chat-hf': 4096 } diff --git a/pr_agent/algo/ai_handler.py b/pr_agent/algo/ai_handler.py index f5fc6722..b48924d6 100644 --- a/pr_agent/algo/ai_handler.py +++ b/pr_agent/algo/ai_handler.py @@ -6,7 +6,6 @@ from litellm import acompletion from openai.error import APIError, RateLimitError, Timeout, TryAgain from retry import retry from pr_agent.config_loader import get_settings - OPENAI_RETRIES = 5 @@ -46,6 +45,8 @@ class AiHandler: litellm.replicate_key = get_settings().replicate.key if get_settings().get("HUGGINGFACE.KEY", None): litellm.huggingface_key = get_settings().huggingface.key + if get_settings().get("HUGGINGFACE.API_BASE", None): + litellm.api_base = get_settings().huggingface.api_base except AttributeError as e: raise ValueError("OpenAI key is required") from e diff --git a/pr_agent/settings/.secrets_template.toml b/pr_agent/settings/.secrets_template.toml index 0ac75519..d4fef551 100644 --- a/pr_agent/settings/.secrets_template.toml +++ b/pr_agent/settings/.secrets_template.toml @@ -24,6 +24,11 @@ key = "" # Optional, uncomment if you want to use Cohere. Acquire through https: [replicate] key = "" # Optional, uncomment if you want to use Replicate. Acquire through https://replicate.com/ + +[huggingface] +key = "" # Optional, uncomment if you want to use Huggingface Inference API. Acquire through https://huggingface.co/docs/api-inference/quicktour +api_base = "" # the base url for your huggingface inference endpoint + [github] # ---- Set the following only for deployment type == "user" user_token = "" # A GitHub personal access token with 'repo' scope.